diff --git a/clang/test/CodeGenCUDA/atomics-remarks-gfx90a.cu b/clang/test/CodeGenCUDA/atomics-remarks-gfx90a.cu index 946927d88a1ee1..3ca766755a6319 100644 --- a/clang/test/CodeGenCUDA/atomics-remarks-gfx90a.cu +++ b/clang/test/CodeGenCUDA/atomics-remarks-gfx90a.cu @@ -10,7 +10,7 @@ // GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at system memory scope // GFX90A-CAS-LABEL: _Z14atomic_add_casPf // GFX90A-CAS: flat_atomic_cmpswap -// GFX90A-CAS: s_cbranch_execnz +// GFX90A-CAS: s_cbranch_scc1 __device__ float atomic_add_cas(float *p) { return __atomic_fetch_add(p, 1.0f, memory_order_relaxed); } diff --git a/llvm/include/llvm/Analysis/CFGPrinter.h b/llvm/include/llvm/Analysis/CFGPrinter.h index cd785331d1f146..e24a9110d596ca 100644 --- a/llvm/include/llvm/Analysis/CFGPrinter.h +++ b/llvm/include/llvm/Analysis/CFGPrinter.h @@ -272,9 +272,11 @@ struct DOTGraphTraits : public DefaultDOTGraphTraits { unsigned OpNo = I.getSuccessorIndex(); const Instruction *TI = Node->getTerminator(); BasicBlock *SuccBB = TI->getSuccessor(OpNo); - auto BranchProb = CFGInfo->getBPI()->getEdgeProbability(Node, SuccBB); - double WeightPercent = ((double)BranchProb.getNumerator()) / - ((double)BranchProb.getDenominator()); + // auto BranchProb = CFGInfo->getBPI()->getEdgeProbability(Node, SuccBB); + // double WeightPercent = ((double)BranchProb.getNumerator()) / + // ((double)BranchProb.getDenominator()); + double WeightPercent = 0.5; + std::string TTAttr = formatv("tooltip=\"{0} -> {1}\\nProbability {2:P}\" ", getBBName(Node), getBBName(SuccBB), WeightPercent); diff --git a/llvm/include/llvm/CodeGen/MachineBasicBlock.h b/llvm/include/llvm/CodeGen/MachineBasicBlock.h index 6efb17c55493a9..9fcda791fb4c72 100644 --- a/llvm/include/llvm/CodeGen/MachineBasicBlock.h +++ b/llvm/include/llvm/CodeGen/MachineBasicBlock.h @@ -879,8 +879,7 @@ class MachineBasicBlock /// debug. This is the correct point to insert copies at the beginning of a /// basic block. 
\p Reg is the register being used by a spill or defined for a /// restore/split during register allocation. - iterator SkipPHIsLabelsAndDebug(iterator I, Register Reg = Register(), - bool SkipPseudoOp = true); + iterator SkipPHIsLabelsAndDebug(iterator I, bool SkipPseudoOp = true); /// Returns an iterator to the first terminator instruction of this basic /// block. If a terminator does not exist, it returns end(). diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h index 49ce13dd8cbe39..984850980f4c9c 100644 --- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -2058,8 +2058,7 @@ class TargetInstrInfo : public MCInstrInfo { /// other instructions shall be inserted before it. This can be implemented /// to prevent register allocator to insert spills for \p Reg before such /// instructions. - virtual bool isBasicBlockPrologue(const MachineInstr &MI, - Register Reg = Register()) const { + virtual bool isBasicBlockPrologue(const MachineInstr &MI) const { return false; } diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index dc13a35c66f9ab..195ceb64eae4a8 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -3098,7 +3098,7 @@ def int_amdgcn_loop : Intrinsic<[llvm_i1_ty], [llvm_anyint_ty], [IntrWillReturn, IntrNoCallback, IntrNoFree] >; -def int_amdgcn_end_cf : Intrinsic<[], [llvm_anyint_ty], +def int_amdgcn_wave_reconverge : Intrinsic<[], [llvm_anyint_ty], [IntrWillReturn, IntrNoCallback, IntrNoFree]>; // Represent unreachable in a divergent region. 
diff --git a/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp b/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp index 3bb9da5f1a37bb..184b493694894d 100644 --- a/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp +++ b/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp @@ -461,8 +461,7 @@ class StatepointState { if (EHPad && !RC.hasReload(Reg, RegToSlotIdx[Reg], EHPad)) { RC.recordReload(Reg, RegToSlotIdx[Reg], EHPad); - auto EHPadInsertPoint = - EHPad->SkipPHIsLabelsAndDebug(EHPad->begin(), Reg); + auto EHPadInsertPoint = EHPad->SkipPHIsLabelsAndDebug(EHPad->begin()); insertReloadBefore(Reg, EHPadInsertPoint, EHPad); LLVM_DEBUG(dbgs() << "...also reload at EHPad " << printMBBReference(*EHPad) << "\n"); diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp index 81ae805d64e1ec..201d3a5df3a536 100644 --- a/llvm/lib/CodeGen/InlineSpiller.cpp +++ b/llvm/lib/CodeGen/InlineSpiller.cpp @@ -463,7 +463,7 @@ bool InlineSpiller::hoistSpillInsideBB(LiveInterval &SpillLI, MachineBasicBlock *MBB = LIS.getMBBFromIndex(SrcVNI->def); MachineBasicBlock::iterator MII; if (SrcVNI->isPHIDef()) - MII = MBB->SkipPHIsLabelsAndDebug(MBB->begin(), SrcReg); + MII = MBB->SkipPHIsLabelsAndDebug(MBB->begin()); else { MachineInstr *DefMI = LIS.getInstructionFromIndex(SrcVNI->def); assert(DefMI && "Defining instruction disappeared"); diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp index 5d06af3ebf3360..419d7e0312ae08 100644 --- a/llvm/lib/CodeGen/MachineBasicBlock.cpp +++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp @@ -223,13 +223,13 @@ MachineBasicBlock::SkipPHIsAndLabels(MachineBasicBlock::iterator I) { MachineBasicBlock::iterator MachineBasicBlock::SkipPHIsLabelsAndDebug(MachineBasicBlock::iterator I, - Register Reg, bool SkipPseudoOp) { + bool SkipPseudoOp) { const TargetInstrInfo *TII = getParent()->getSubtarget().getInstrInfo(); iterator E = end(); while (I != E && (I->isPHI() || I->isPosition() || I->isDebugInstr() || 
(SkipPseudoOp && I->isPseudoProbe()) || - TII->isBasicBlockPrologue(*I, Reg))) + TII->isBasicBlockPrologue(*I))) ++I; // FIXME: This needs to change if we wish to bundle labels / dbg_values // inside the bundle. diff --git a/llvm/lib/CodeGen/SplitKit.cpp b/llvm/lib/CodeGen/SplitKit.cpp index b671e510387530..22991a0fb4cb1e 100644 --- a/llvm/lib/CodeGen/SplitKit.cpp +++ b/llvm/lib/CodeGen/SplitKit.cpp @@ -806,10 +806,8 @@ SlotIndex SplitEditor::leaveIntvAtTop(MachineBasicBlock &MBB) { return Start; } - unsigned RegIdx = 0; - Register Reg = LIS.getInterval(Edit->get(RegIdx)).reg(); - VNInfo *VNI = defFromParent(RegIdx, ParentVNI, Start, MBB, - MBB.SkipPHIsLabelsAndDebug(MBB.begin(), Reg)); + VNInfo *VNI = defFromParent(0, ParentVNI, Start, MBB, + MBB.SkipPHIsLabelsAndDebug(MBB.begin())); RegAssign.insert(Start, VNI->def, OpenIdx); LLVM_DEBUG(dump()); return VNI->def; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 3fcb364fc2c536..c0d2853d159882 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1551,11 +1551,12 @@ bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const { return true; } -bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const { +bool AMDGPUInstructionSelector::selectWaveReconvergeIntrinsic( + MachineInstr &MI) const { // FIXME: Manually selecting to avoid dealing with the SReg_1 trick // SelectionDAG uses for wave32 vs wave64. 
MachineBasicBlock *BB = MI.getParent(); - BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF)) + BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_WAVE_RECONVERGE)) .add(MI.getOperand(1)); Register Reg = MI.getOperand(1).getReg(); @@ -2083,8 +2084,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( MachineInstr &I) const { Intrinsic::ID IntrinsicID = cast(I).getIntrinsicID(); switch (IntrinsicID) { - case Intrinsic::amdgcn_end_cf: - return selectEndCfIntrinsic(I); + case Intrinsic::amdgcn_wave_reconverge: + return selectWaveReconvergeIntrinsic(I); case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: return selectDSOrderedIntrinsic(I, IntrinsicID); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 068db5c1c14496..c3ba26590dfbcf 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -116,7 +116,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector { bool selectReturnAddress(MachineInstr &I) const; bool selectG_INTRINSIC(MachineInstr &I) const; - bool selectEndCfIntrinsic(MachineInstr &MI) const; + bool selectWaveReconvergeIntrinsic(MachineInstr &MI) const; bool selectDSOrderedIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const; bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const; bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 4737a322c255f4..1d2ee6a4c96514 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -785,8 +785,6 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass(); const unsigned MovExecOpc = Subtarget.isWave32() ? 
AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - const unsigned MovExecTermOpc = - Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term; const unsigned XorTermOpc = Subtarget.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term; @@ -949,27 +947,27 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( B.setInsertPt(*BodyBB, BodyBB->end()); + Register LoopMask = MRI.createVirtualRegister( + TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID)); // Update EXEC, switch all done bits to 0 and all todo bits to 1. B.buildInstr(XorTermOpc) - .addDef(ExecReg) - .addReg(ExecReg) - .addReg(NewExec); + .addDef(LoopMask) + .addReg(ExecReg) + .addReg(NewExec); // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use // s_cbranch_scc0? // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover. - B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB); + B.buildInstr(AMDGPU::SI_WATERFALL_LOOP) + .addReg(LoopMask) + .addReg(NewExec) + .addMBB(LoopBB); // Save the EXEC mask before the loop. BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg) .addReg(ExecReg); - // Restore the EXEC mask after the loop. - B.setMBB(*RestoreExecBB); - B.buildInstr(MovExecTermOpc) - .addDef(ExecReg) - .addReg(SaveExecReg); // Set the insert point after the original instruction, so any new // instructions will be in the remainder. 
@@ -4967,7 +4965,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32); break; } - case Intrinsic::amdgcn_end_cf: { + case Intrinsic::amdgcn_wave_reconverge: { unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); break; diff --git a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp index edd881c84078c6..cd8cbcc7f689d4 100644 --- a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -14,6 +14,7 @@ #include "AMDGPU.h" #include "AMDGPUTargetMachine.h" #include "GCNSubtarget.h" +#include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/UniformityAnalysis.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -55,7 +56,7 @@ class SIAnnotateControlFlow { Function *Else; Function *IfBreak; Function *Loop; - Function *EndCf; + Function *WaveReconverge; DominatorTree *DT; StackVector Stack; @@ -88,7 +89,7 @@ class SIAnnotateControlFlow { bool handleLoop(BranchInst *Term); - bool closeControlFlow(BasicBlock *BB); + bool tryWaveReconverge(BasicBlock *BB); public: SIAnnotateControlFlow(Module &M, const GCNSubtarget &ST, DominatorTree &DT, @@ -123,7 +124,8 @@ void SIAnnotateControlFlow::initialize(Module &M, const GCNSubtarget &ST) { IfBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if_break, { IntMask }); Loop = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_loop, { IntMask }); - EndCf = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_end_cf, { IntMask }); + WaveReconverge = Intrinsic::getDeclaration( + &M, Intrinsic::amdgcn_wave_reconverge, {IntMask}); } /// Is the branch condition uniform or did the StructurizeCFG pass @@ -185,8 +187,6 @@ bool SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) { /// Open a new "If" block bool SIAnnotateControlFlow::openIf(BranchInst 
*Term) { - if (isUniform(Term)) - return false; IRBuilder<> IRB(Term); Value *IfCall = IRB.CreateCall(If, {Term->getCondition()}); @@ -287,43 +287,43 @@ bool SIAnnotateControlFlow::handleLoop(BranchInst *Term) { } /// Close the last opened control flow -bool SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { - llvm::Loop *L = LI->getLoopFor(BB); - - assert(Stack.back().first == BB); +bool SIAnnotateControlFlow::tryWaveReconverge(BasicBlock *BB) { - if (L && L->getHeader() == BB) { - // We can't insert an EndCF call into a loop header, because it will - // get executed on every iteration of the loop, when it should be - // executed only once before the loop. - SmallVector Latches; - L->getLoopLatches(Latches); + if (succ_empty(BB)) + return false; + BranchInst *Term = dyn_cast(BB->getTerminator()); + if (Term->getNumSuccessors() == 1) { + // The current BB's single successor is the top of the stack. We need to + // reconverge over that path. + BasicBlock *SingleSucc = *succ_begin(BB); + BasicBlock::iterator InsPt = Term ? BasicBlock::iterator(Term) : BB->end(); + if (isTopOfStack(SingleSucc)) { + Value *Exec = Stack.back().second; + IRBuilder<>(BB, InsPt).CreateCall(WaveReconverge, {Exec}); + } + } else { + // We have a uniform conditional branch terminating the block. + // This block may be the last in the Then path of the enclosing divergent + // IF. 
+ + if (!isUniform(Term)) + // Divergent loop is going to be further processed in another place + return false; + + for (auto Succ : Term->successors()) { + if (isTopOfStack(Succ)) { + // Just split to make room for a later WAVE_RECONVERGE insertion SmallVector Preds; - for (BasicBlock *Pred : predecessors(BB)) { - if (!is_contained(Latches, Pred)) - Preds.push_back(Pred); + for (auto P : predecessors(Succ)) { + if (DT->dominates(BB, P)) + Preds.push_back(P); } - - BB = SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, nullptr, + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); + SplitBlockPredecessors(Succ, Preds, ".reconverge", &DTU, LI, nullptr, false); } - Value *Exec = popSaved(); - BasicBlock::iterator FirstInsertionPt = BB->getFirstInsertionPt(); - if (!isa(Exec) && !isa(FirstInsertionPt)) { - Instruction *ExecDef = cast(Exec); - BasicBlock *DefBB = ExecDef->getParent(); - if (!DT->dominates(DefBB, BB)) { - // Split edge to make Def dominate Use - FirstInsertionPt = SplitEdge(DefBB, BB, DT, LI)->getFirstInsertionPt(); } - IRBuilder<> IRB(FirstInsertionPt->getParent(), FirstInsertionPt); - // TODO: StructurizeCFG 'Flow' blocks have debug locations from the - // condition, for now just avoid copying these DebugLocs so that stepping - // out of the then/else block in a debugger doesn't step to the condition. - IRB.SetCurrentDebugLocation(DebugLoc()); - IRB.CreateCall(EndCf, {Exec}); } return true; @@ -341,14 +341,18 @@ bool SIAnnotateControlFlow::run(Function &F) { if (!Term || Term->isUnconditional()) { if (isTopOfStack(BB)) - Changed |= closeControlFlow(BB); + Stack.pop_back(); + Changed |= tryWaveReconverge(BB); continue; } if (I.nodeVisited(Term->getSuccessor(1))) { if (isTopOfStack(BB)) - Changed |= closeControlFlow(BB); + Stack.pop_back(); + // Let's take care of uniform loop latch that may be closing the Then + // path of the enclosing divergent branch. 
+ Changed |= tryWaveReconverge(BB); if (DT->dominates(Term->getSuccessor(1), BB)) Changed |= handleLoop(Term); @@ -363,9 +367,14 @@ bool SIAnnotateControlFlow::run(Function &F) { continue; } - Changed |= closeControlFlow(BB); + Stack.pop_back(); } + if (isUniform(Term)) + // Uniform conditional branch may be in the block that closes the Then + // path of the divergent conditional branch. + Changed |= tryWaveReconverge(BB); + else Changed |= openIf(Term); } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 81b52935ddf397..ae3b849a55ff2e 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6475,7 +6475,7 @@ unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const { return AMDGPUISD::ELSE; case Intrinsic::amdgcn_loop: return AMDGPUISD::LOOP; - case Intrinsic::amdgcn_end_cf: + case Intrinsic::amdgcn_wave_reconverge: llvm_unreachable("should not occur"); default: return 0; @@ -9848,9 +9848,10 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, return SDValue(Load, 0); } - case Intrinsic::amdgcn_end_cf: - return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other, - Op->getOperand(2), Chain), 0); + case Intrinsic::amdgcn_wave_reconverge: + return SDValue(DAG.getMachineNode(AMDGPU::SI_WAVE_RECONVERGE, DL, + MVT::Other, Op->getOperand(2), Chain), + 0); case Intrinsic::amdgcn_s_barrier_init: case Intrinsic::amdgcn_s_barrier_join: case Intrinsic::amdgcn_s_wakeup_barrier: { @@ -15693,6 +15694,28 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const { } } + // ISel inserts copy to regs for the successor PHIs + // at the BB end. We need to move the SI_WAVE_RECONVERGE right before the + // branch. 
+ for (auto &MBB : MF) { + for (auto &MI : MBB) { + if (MI.getOpcode() == AMDGPU::SI_WAVE_RECONVERGE) { + MachineBasicBlock::iterator I(MI); + MachineBasicBlock::iterator Next = std::next(I); + bool NeedToMove = false; + while (Next != MBB.end() && !Next->isBranch()) { + NeedToMove = true; + Next++; + } + assert((Next == MBB.end() || !Next->readsRegister(AMDGPU::SCC, TRI)) && + "Malformed CFG detected!\n"); + if (NeedToMove) { + MBB.splice(Next, &MBB, &MI); + } + break; + } + } + } // FIXME: This is a hack to fixup AGPR classes to use the properly aligned // classes if required. Ideally the register class constraints would differ // per-subtarget, but there's no easy way to achieve that right now. This is @@ -16451,7 +16474,7 @@ static bool hasCFUser(const Value *V, SmallPtrSet &Visited, default: Result = false; break; - case Intrinsic::amdgcn_end_cf: + case Intrinsic::amdgcn_wave_reconverge: case Intrinsic::amdgcn_loop: Result = true; break; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index a857bdba53c3e8..2a4d08a28b258e 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2110,11 +2110,29 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.setDesc(get(AMDGPU::S_MOV_B64)); break; + case AMDGPU::S_CMOV_B64_term: + MI.setDesc(get(AMDGPU::S_CMOV_B64)); + break; + case AMDGPU::S_CMP_LG_U64_term: + MI.setDesc(get(AMDGPU::S_CMP_LG_U64)); + break; case AMDGPU::S_MOV_B32_term: // This is only a terminator to get the correct spill code placement during // register allocation. 
MI.setDesc(get(AMDGPU::S_MOV_B32)); break; + case AMDGPU::S_CMOV_B32_term: + MI.setDesc(get(AMDGPU::S_CMOV_B32)); + break; + case AMDGPU::S_CMP_LG_U32_term: + MI.setDesc(get(AMDGPU::S_CMP_LG_U32)); + break; + case AMDGPU::S_CSELECT_B32_term: + MI.setDesc(get(AMDGPU::S_CSELECT_B32)); + break; + case AMDGPU::S_CSELECT_B64_term: + MI.setDesc(get(AMDGPU::S_CSELECT_B64)); + break; case AMDGPU::S_XOR_B64_term: // This is only a terminator to get the correct spill code placement during @@ -3082,20 +3100,27 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, while (I != E && !I->isBranch() && !I->isReturn()) { switch (I->getOpcode()) { case AMDGPU::S_MOV_B64_term: + case AMDGPU::S_CMOV_B64_term: + case AMDGPU::S_CMP_LG_U64_term: case AMDGPU::S_XOR_B64_term: case AMDGPU::S_OR_B64_term: case AMDGPU::S_ANDN2_B64_term: case AMDGPU::S_AND_B64_term: case AMDGPU::S_AND_SAVEEXEC_B64_term: + case AMDGPU::S_CSELECT_B64_term: case AMDGPU::S_MOV_B32_term: + case AMDGPU::S_CMOV_B32_term: + case AMDGPU::S_CMP_LG_U32_term: case AMDGPU::S_XOR_B32_term: case AMDGPU::S_OR_B32_term: case AMDGPU::S_ANDN2_B32_term: case AMDGPU::S_AND_B32_term: case AMDGPU::S_AND_SAVEEXEC_B32_term: + case AMDGPU::S_CSELECT_B32_term: break; case AMDGPU::SI_IF: case AMDGPU::SI_ELSE: + case AMDGPU::SI_WAVE_RECONVERGE: case AMDGPU::SI_KILL_I1_TERMINATOR: case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: // FIXME: It's messy that these need to be considered here at all. @@ -6268,6 +6293,9 @@ static void emitLoadScalarOpsFromVGPRLoop( ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term; unsigned AndOpc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; +#ifndef NDEBUG + unsigned MovExecOpc = ST.isWave32() ? 
AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; +#endif const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); MachineBasicBlock::iterator I = LoopBB.begin(); @@ -6376,6 +6404,7 @@ static void emitLoadScalarOpsFromVGPRLoop( } Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); + Register LoopMask = MRI.createVirtualRegister(BoolXExecRC); MRI.setSimpleHint(SaveExec, CondReg); // Update EXEC to matching lanes, saving original to SaveExec. @@ -6386,11 +6415,17 @@ static void emitLoadScalarOpsFromVGPRLoop( I = BodyBB.end(); // Update EXEC, switch all done bits to 0 and all todo bits to 1. - BuildMI(BodyBB, I, DL, TII.get(XorTermOpc), Exec) + BuildMI(BodyBB, I, DL, TII.get(XorTermOpc), LoopMask) .addReg(Exec) .addReg(SaveExec); - BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB); + MachineInstr *ExitExecDef = &*OrigBB.getLastNonDebugInstr(false); + assert(ExitExecDef != OrigBB.end() && ExitExecDef->getOpcode() == MovExecOpc); + Register ExitExec = ExitExecDef->getOperand(0).getReg(); + BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)) + .addReg(LoopMask) + .addReg(ExitExec) + .addMBB(&LoopBB); } // Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register @@ -6497,8 +6532,6 @@ loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, .addImm(0); } - // Restore the EXEC mask - BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec); return BodyBB; } @@ -8840,26 +8873,10 @@ unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg, return AMDGPU::COPY; } -bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI, - Register Reg) const { - // We need to handle instructions which may be inserted during register - // allocation to handle the prolog. The initial prolog instruction may have - // been separated from the start of the block by spills and copies inserted - // needed by the prolog. 
However, the insertions for scalar registers can - // always be placed at the BB top as they are independent of the exec mask - // value. - bool IsNullOrVectorRegister = true; - if (Reg) { - const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); - IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg)); - } +bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const { - uint16_t Opcode = MI.getOpcode(); - // FIXME: Copies inserted in the block prolog for live-range split should also - // be included. - return IsNullOrVectorRegister && - (isSpill(Opcode) || (!MI.isTerminator() && Opcode != AMDGPU::COPY && - MI.modifiesRegister(AMDGPU::EXEC, &RI))); + return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY && + MI.modifiesRegister(AMDGPU::EXEC, &RI); } MachineInstrBuilder diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 4fd9b4366159be..d47e5680b83e8d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1334,8 +1334,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override; - bool isBasicBlockPrologue(const MachineInstr &MI, - Register Reg = Register()) const override; + bool isBasicBlockPrologue(const MachineInstr &MI) const override; MachineInstr *createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 69e1b9a38324f2..4d742b771891e8 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -341,6 +341,9 @@ class WrapTerminatorInst : SPseudoInstSI< let WaveSizePredicate = isWave64 in { def S_MOV_B64_term : WrapTerminatorInst; +def S_CMOV_B64_term : WrapTerminatorInst; +def S_CSELECT_B64_term : WrapTerminatorInst; +def S_CMP_LG_U64_term : WrapTerminatorInst; def 
S_XOR_B64_term : WrapTerminatorInst; def S_OR_B64_term : WrapTerminatorInst; def S_ANDN2_B64_term : WrapTerminatorInst; @@ -350,6 +353,9 @@ def S_AND_SAVEEXEC_B64_term : WrapTerminatorInst; let WaveSizePredicate = isWave32 in { def S_MOV_B32_term : WrapTerminatorInst; +def S_CMOV_B32_term : WrapTerminatorInst; +def S_CSELECT_B32_term : WrapTerminatorInst; +def S_CMP_LG_U32_term : WrapTerminatorInst; def S_XOR_B32_term : WrapTerminatorInst; def S_OR_B32_term : WrapTerminatorInst; def S_ANDN2_B32_term : WrapTerminatorInst; @@ -442,9 +448,10 @@ def SI_ELSE : CFPseudoInstSI < def SI_WATERFALL_LOOP : CFPseudoInstSI < (outs), - (ins brtarget:$target), [], 1> { + (ins SReg_1:$LoopMask, SReg_1:$ExitMask, brtarget:$target), [], 1> { let Size = 8; let isBranch = 1; + let Uses = [SCC]; let Defs = []; } @@ -457,9 +464,8 @@ def SI_LOOP : CFPseudoInstSI < let IsNeverUniform = 1; } -} // End isTerminator = 1 -def SI_END_CF : CFPseudoInstSI < +def SI_WAVE_RECONVERGE : CFPseudoInstSI < (outs), (ins SReg_1:$saved), [], 1, 1> { let Size = 4; let isAsCheapAsAMove = 1; @@ -469,6 +475,7 @@ def SI_END_CF : CFPseudoInstSI < let mayLoad = 1; // FIXME: Should not need memory flags let mayStore = 1; } +} // End isTerminator = 1 def SI_IF_BREAK : CFPseudoInstSI < (outs SReg_1:$dst), (ins SReg_1:$vcc, SReg_1:$src), []> { diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index 99c7d2b306789a..6380f8e3ec7736 100644 --- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -25,7 +25,7 @@ /// %vgpr0 = V_ADD_F32 %vgpr0, %vgpr0 /// %sgpr0 = SI_ELSE %sgpr0 /// %vgpr0 = V_SUB_F32 %vgpr0, %vgpr0 -/// SI_END_CF %sgpr0 +/// SI_WAVE_RECONVERGE %sgpr0 /// /// becomes: /// @@ -76,13 +76,19 @@ class SILowerControlFlow : public MachineFunctionPass { LiveVariables *LV = nullptr; MachineDominatorTree *MDT = nullptr; MachineRegisterInfo *MRI = nullptr; - SetVector LoweredEndCf; + SetVector 
LoweredWaveReconverge; DenseSet LoweredIf; SmallSet KillBlocks; SmallSet RecomputeRegs; const TargetRegisterClass *BoolRC = nullptr; + uint64_t TestMask; + unsigned Select; + unsigned CmovOpc; + unsigned CmpOpc; unsigned AndOpc; + unsigned AndTermOpc; + unsigned Andn2Opc; unsigned OrOpc; unsigned XorOpc; unsigned MovTermOpc; @@ -100,25 +106,21 @@ class SILowerControlFlow : public MachineFunctionPass { void emitElse(MachineInstr &MI); void emitIfBreak(MachineInstr &MI); void emitLoop(MachineInstr &MI); + void emitWaterfallLoop(MachineInstr &MI); + void emitWaveDiverge(MachineInstr &MI, Register EnabledLanesMask, + Register DisableLanesMask, bool IsIf); - MachineBasicBlock *emitEndCf(MachineInstr &MI); + void emitWaveReconverge(MachineInstr &MI); void findMaskOperands(MachineInstr &MI, unsigned OpNo, SmallVectorImpl &Src) const; void combineMasks(MachineInstr &MI); - bool removeMBBifRedundant(MachineBasicBlock &MBB); + bool needMaskWithExec(const MachineInstr &MI) const; MachineBasicBlock *process(MachineInstr &MI); - // Skip to the next instruction, ignoring debug instructions, and trivial - // block boundaries (blocks that have one (typically fallthrough) successor, - // and the successor has one predecessor. - MachineBasicBlock::iterator - skipIgnoreExecInstsTrivialSucc(MachineBasicBlock &MBB, - MachineBasicBlock::iterator It) const; - /// Find the insertion point for a new conditional branch. MachineBasicBlock::iterator skipToUncondBrOrEnd(MachineBasicBlock &MBB, @@ -132,9 +134,6 @@ class SILowerControlFlow : public MachineFunctionPass { return I; } - // Remove redundant SI_END_CF instructions. 
- void optimizeEndCf(); - public: static char ID; @@ -164,12 +163,7 @@ char SILowerControlFlow::ID = 0; INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE, "SI lower control flow", false, false) -static void setImpSCCDefDead(MachineInstr &MI, bool IsDead) { - MachineOperand &ImpDefSCC = MI.getOperand(3); - assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef()); - ImpDefSCC.setIsDead(IsDead); -} char &llvm::SILowerControlFlowID = SILowerControlFlow::ID; @@ -198,7 +192,7 @@ static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI) { if (U == MRI->use_instr_nodbg_end() || std::next(U) != MRI->use_instr_nodbg_end() || - U->getOpcode() != AMDGPU::SI_END_CF) + U->getOpcode() != AMDGPU::SI_WAVE_RECONVERGE) return false; return true; @@ -208,161 +202,62 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); MachineBasicBlock::iterator I(&MI); - Register SaveExecReg = MI.getOperand(0).getReg(); + Register MaskElse = MI.getOperand(0).getReg(); MachineOperand& Cond = MI.getOperand(1); assert(Cond.getSubReg() == AMDGPU::NoSubRegister); + Register CondReg = Cond.getReg(); + if (!needMaskWithExec(MI)) + return emitWaveDiverge(MI, CondReg, MaskElse, true); - MachineOperand &ImpDefSCC = MI.getOperand(4); - assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef()); + Register MaskThen = MRI->createVirtualRegister(BoolRC); + // Get rid of the garbage bits in the Cond register which might be coming from - // If there is only one use of save exec register and that use is SI_END_CF, - // we can optimize SI_IF by returning the full saved exec mask instead of - // just cleared bits. - bool SimpleIf = isSimpleIf(MI, MRI); + // the bitwise arithmetic when one of the expression operands is coming from - if (SimpleIf) { - // Check for SI_KILL_*_TERMINATOR on path from if to endif. - // if there is any such terminator simplifications are not safe. 
- auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg); - SimpleIf = !hasKill(MI.getParent(), UseMI->getParent()); - } + // the outer scope and hence having extra bits set. - // Add an implicit def of exec to discourage scheduling VALU after this which - // will interfere with trying to form s_and_saveexec_b64 later. - Register CopyReg = SimpleIf ? SaveExecReg - : MRI->createVirtualRegister(BoolRC); - MachineInstr *CopyExec = - BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg) - .addReg(Exec) - .addReg(Exec, RegState::ImplicitDefine); - LoweredIf.insert(CopyReg); - - Register Tmp = MRI->createVirtualRegister(BoolRC); - - MachineInstr *And = - BuildMI(MBB, I, DL, TII->get(AndOpc), Tmp) - .addReg(CopyReg) - .add(Cond); + + MachineInstr *CondFiltered = + BuildMI(MBB, I, DL, TII->get(AndOpc), MaskThen).add(Cond).addReg(Exec); if (LV) - LV->replaceKillInstruction(Cond.getReg(), MI, *And); + LV->replaceKillInstruction(CondReg, MI, *CondFiltered); - setImpSCCDefDead(*And, true); + emitWaveDiverge(MI, MaskThen, MaskElse, true); - MachineInstr *Xor = nullptr; - if (!SimpleIf) { - Xor = - BuildMI(MBB, I, DL, TII->get(XorOpc), SaveExecReg) - .addReg(Tmp) - .addReg(CopyReg); - setImpSCCDefDead(*Xor, ImpDefSCC.isDead()); - } - // Use a copy that is a terminator to get correct spill code placement it with - // fast regalloc. - MachineInstr *SetExec = - BuildMI(MBB, I, DL, TII->get(MovTermOpc), Exec) - .addReg(Tmp, RegState::Kill); - if (LV) - LV->getVarInfo(Tmp).Kills.push_back(SetExec); + if (LIS) { + LIS->InsertMachineInstrInMaps(*CondFiltered); + LIS->createAndComputeVirtRegInterval(MaskThen); - // Skip ahead to the unconditional branch in case there are other terminators - // present. - I = skipToUncondBrOrEnd(MBB, I); - // Insert the S_CBRANCH_EXECZ instruction which will be optimized later - // during SIRemoveShortExecBranches. 
- MachineInstr *NewBr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ)) - .add(MI.getOperand(2)); - if (!LIS) { - MI.eraseFromParent(); - return; } - LIS->InsertMachineInstrInMaps(*CopyExec); - // Replace with and so we don't need to fix the live interval for condition - // register. - LIS->ReplaceMachineInstrInMaps(MI, *And); - if (!SimpleIf) - LIS->InsertMachineInstrInMaps(*Xor); - LIS->InsertMachineInstrInMaps(*SetExec); - LIS->InsertMachineInstrInMaps(*NewBr); - LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC); - MI.eraseFromParent(); - // FIXME: Is there a better way of adjusting the liveness? It shouldn't be - // hard to add another def here but I'm not sure how to correctly update the - // valno. - RecomputeRegs.insert(SaveExecReg); - LIS->createAndComputeVirtRegInterval(Tmp); - if (!SimpleIf) - LIS->createAndComputeVirtRegInterval(CopyReg); } void SILowerControlFlow::emitElse(MachineInstr &MI) { - MachineBasicBlock &MBB = *MI.getParent(); - const DebugLoc &DL = MI.getDebugLoc(); - Register DstReg = MI.getOperand(0).getReg(); - Register SrcReg = MI.getOperand(1).getReg(); + Register InvCondReg = MI.getOperand(0).getReg(); + Register CondReg = MI.getOperand(1).getReg(); - MachineBasicBlock::iterator Start = MBB.begin(); - // This must be inserted before phis and any spill code inserted before the - // else. - Register SaveReg = MRI->createVirtualRegister(BoolRC); - MachineInstr *OrSaveExec = - BuildMI(MBB, Start, DL, TII->get(OrSaveExecOpc), SaveReg) - .add(MI.getOperand(1)); // Saved EXEC - if (LV) - LV->replaceKillInstruction(SrcReg, MI, *OrSaveExec); - MachineBasicBlock *DestBB = MI.getOperand(2).getMBB(); - MachineBasicBlock::iterator ElsePt(MI); - // This accounts for any modification of the EXEC mask within the block and - // can be optimized out pre-RA when not required. 
- MachineInstr *And = BuildMI(MBB, ElsePt, DL, TII->get(AndOpc), DstReg) - .addReg(Exec) - .addReg(SaveReg); - MachineInstr *Xor = - BuildMI(MBB, ElsePt, DL, TII->get(XorTermrOpc), Exec) - .addReg(Exec) - .addReg(DstReg); - // Skip ahead to the unconditional branch in case there are other terminators - // present. - ElsePt = skipToUncondBrOrEnd(MBB, ElsePt); - MachineInstr *Branch = - BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ)) - .addMBB(DestBB); - if (!LIS) { - MI.eraseFromParent(); - return; - } - LIS->RemoveMachineInstrFromMaps(MI); - MI.eraseFromParent(); - LIS->InsertMachineInstrInMaps(*OrSaveExec); - LIS->InsertMachineInstrInMaps(*And); - LIS->InsertMachineInstrInMaps(*Xor); - LIS->InsertMachineInstrInMaps(*Branch); - RecomputeRegs.insert(SrcReg); - RecomputeRegs.insert(DstReg); - LIS->createAndComputeVirtRegInterval(SaveReg); + emitWaveDiverge(MI, CondReg, InvCondReg, false); - // Let this be recomputed. - LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC); } void SILowerControlFlow::emitIfBreak(MachineInstr &MI) { @@ -374,19 +269,12 @@ void SILowerControlFlow::emitIfBreak(MachineInstr &MI) { // because it is a V_CMP in the same basic block. (We know the break // condition operand was an i1 in IR, so if it is a VALU instruction it must // be one with a carry-out.) - bool SkipAnding = false; - if (MI.getOperand(1).isReg()) { - if (MachineInstr *Def = MRI->getUniqueVRegDef(MI.getOperand(1).getReg())) { - SkipAnding = Def->getParent() == MI.getParent() - && SIInstrInfo::isVALU(*Def); - } - } // AND the break condition operand with exec, then OR that into the "loop // exit" mask. 
MachineInstr *And = nullptr, *Or = nullptr; Register AndReg; - if (!SkipAnding) { + if (needMaskWithExec(MI)) { AndReg = MRI->createVirtualRegister(BoolRC); And = BuildMI(MBB, &MI, DL, TII->get(AndOpc), AndReg) .addReg(Exec) @@ -423,141 +311,198 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); - MachineInstr *AndN2 = - BuildMI(MBB, &MI, DL, TII->get(Andn2TermOpc), Exec) + Register Cond = MI.getOperand(0).getReg(); + Register MaskLoop = MRI->createVirtualRegister(BoolRC); + Register AndZero = MRI->createVirtualRegister(BoolRC); + MachineInstr *CondLoop = BuildMI(MBB, &MI, DL, TII->get(Andn2Opc), MaskLoop) .addReg(Exec) - .add(MI.getOperand(0)); + .addReg(Cond); + MachineInstr *SetExec = BuildMI(MBB, &MI, DL, TII->get(Select), Exec) + .addReg(MaskLoop) + .addReg(Cond); if (LV) - LV->replaceKillInstruction(MI.getOperand(0).getReg(), MI, *AndN2); + LV->replaceKillInstruction(MI.getOperand(0).getReg(), MI, *SetExec); auto BranchPt = skipToUncondBrOrEnd(MBB, MI.getIterator()); MachineInstr *Branch = - BuildMI(MBB, BranchPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) + BuildMI(MBB, BranchPt, DL, TII->get(AMDGPU::S_CBRANCH_SCC1)) .add(MI.getOperand(1)); if (LIS) { RecomputeRegs.insert(MI.getOperand(0).getReg()); - LIS->ReplaceMachineInstrInMaps(MI, *AndN2); + LIS->ReplaceMachineInstrInMaps(MI, *SetExec); + LIS->InsertMachineInstrInMaps(*CondLoop); LIS->InsertMachineInstrInMaps(*Branch); + LIS->createAndComputeVirtRegInterval(MaskLoop); + LIS->createAndComputeVirtRegInterval(AndZero); } MI.eraseFromParent(); } -MachineBasicBlock::iterator -SILowerControlFlow::skipIgnoreExecInstsTrivialSucc( - MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const { +void SILowerControlFlow::emitWaterfallLoop(MachineInstr &MI) { + Register LoopMask = MI.getOperand(0).getReg(); + Register ExitMask = MI.getOperand(1).getReg(); + MachineBasicBlock *LoopBB = MI.getOperand(2).getMBB(); - SmallSet Visited; 
- MachineBasicBlock *B = &MBB; - do { - if (!Visited.insert(B).second) - return MBB.end(); + MachineBasicBlock *BodyBB = MI.getParent(); + MachineBasicBlock::iterator I = BodyBB->end(); + const DebugLoc DL = MI.getDebugLoc(); - auto E = B->end(); - for ( ; It != E; ++It) { - if (TII->mayReadEXEC(*MRI, *It)) - break; + MachineInstr *UpdateExec = BuildMI(*BodyBB, I, DL, TII->get(Select), Exec) + .addReg(LoopMask) + .addReg(ExitMask); + + MachineInstr *Branch = + BuildMI(*BodyBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1)).addMBB(LoopBB); + + if (LIS) { + RecomputeRegs.insert(MI.getOperand(0).getReg()); + RecomputeRegs.insert(MI.getOperand(1).getReg()); + LIS->ReplaceMachineInstrInMaps(MI, *UpdateExec); + LIS->InsertMachineInstrInMaps(*Branch); } - if (It != E) - return It; - if (B->succ_size() != 1) - return MBB.end(); + MI.eraseFromParent(); - // If there is one trivial successor, advance to the next block. - MachineBasicBlock *Succ = *B->succ_begin(); - It = Succ->begin(); - B = Succ; - } while (true); } -MachineBasicBlock *SILowerControlFlow::emitEndCf(MachineInstr &MI) { +void SILowerControlFlow::emitWaveDiverge(MachineInstr &MI, + Register EnabledLanesMask, + Register DisableLanesMask, bool IsIf) { MachineBasicBlock &MBB = *MI.getParent(); + MachineFunction *MF = MBB.getParent(); + const GCNSubtarget &ST = MF->getSubtarget(); + bool IsWave32 = ST.isWave32(); const DebugLoc &DL = MI.getDebugLoc(); - MachineBasicBlock::iterator InsPt = MBB.begin(); - - // If we have instructions that aren't prolog instructions, split the block - // and emit a terminator instruction. This ensures correct spill placement. - // FIXME: We should unconditionally split the block here. 
- bool NeedBlockSplit = false; - Register DataReg = MI.getOperand(0).getReg(); - for (MachineBasicBlock::iterator I = InsPt, E = MI.getIterator(); - I != E; ++I) { - if (I->modifiesRegister(DataReg, TRI)) { - NeedBlockSplit = true; - break; + MachineBasicBlock::iterator I(MI); + + bool NeedXor = true; + if (IsIf) { + // If there is only one use of save exec register and that use is + // SI_END_CF, we can optimize SI_IF by returning the full saved exec mask + // instead of just cleared bits. + bool SimpleIf = isSimpleIf(MI, MRI); + if (SimpleIf) { + // Check for SI_KILL_*_TERMINATOR on path from if to endif. + // if there is any such terminator simplifications are not safe. + auto UseMI = MRI->use_instr_nodbg_begin(DisableLanesMask); + SimpleIf = !hasKill(MI.getParent(), UseMI->getParent()); } + NeedXor = !SimpleIf; } - unsigned Opcode = OrOpc; - MachineBasicBlock *SplitBB = &MBB; - if (NeedBlockSplit) { - SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/true, LIS); - if (MDT && SplitBB != &MBB) { - MachineDomTreeNode *MBBNode = (*MDT)[&MBB]; - SmallVector Children(MBBNode->begin(), - MBBNode->end()); - MachineDomTreeNode *SplitBBNode = MDT->addNewBlock(SplitBB, &MBB); - for (MachineDomTreeNode *Child : Children) - MDT->changeImmediateDominator(Child, SplitBBNode); + if (NeedXor) { + + MachineInstr *CondInverted = + BuildMI(MBB, I, DL, TII->get(XorOpc), DisableLanesMask) + .addReg(EnabledLanesMask) + .addReg(Exec); + if (LV) { + LV->replaceKillInstruction(DisableLanesMask, MI, *CondInverted); } - Opcode = OrTermrOpc; - InsPt = MI; + + if (LIS) { + LIS->InsertMachineInstrInMaps(*CondInverted); + } + } else { + MachineInstr *CopyExec = + BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DisableLanesMask) + .addReg(Exec); + if (LIS) + LIS->InsertMachineInstrInMaps(*CopyExec); + LoweredIf.insert(DisableLanesMask); } - MachineInstr *NewMI = - BuildMI(MBB, InsPt, DL, TII->get(Opcode), Exec) - .addReg(Exec) - .add(MI.getOperand(0)); - if (LV) { - LV->replaceKillInstruction(DataReg, 
MI, *NewMI); - - if (SplitBB != &MBB) { - // Track the set of registers defined in the original block so we don't - // accidentally add the original block to AliveBlocks. AliveBlocks only - // includes blocks which are live through, which excludes live outs and - // local defs. - DenseSet DefInOrigBlock; - - for (MachineBasicBlock *BlockPiece : {&MBB, SplitBB}) { - for (MachineInstr &X : *BlockPiece) { - for (MachineOperand &Op : X.all_defs()) { - if (Op.getReg().isVirtual()) - DefInOrigBlock.insert(Op.getReg()); + MachineBasicBlock::reverse_iterator J(I); + while (J != MBB.rend() && !J->definesRegister(AMDGPU::SCC, TRI)) + ++J; + MachineInstr *EnabledLanesMaskDef = MRI->getUniqueVRegDef(EnabledLanesMask); + bool SkipSetSCC = + J != MBB.rend() && EnabledLanesMaskDef && + EnabledLanesMaskDef->getParent() == &MBB && + EnabledLanesMaskDef->definesRegister(EnabledLanesMask, TRI) && + EnabledLanesMaskDef == &*J; + if (!SkipSetSCC) { + MachineInstr *IfZeroMask = nullptr; + bool HasCmpOpc = IsWave32 || AMDGPU::isGFX8Plus(ST); + if (HasCmpOpc) { + IfZeroMask = BuildMI(MBB, I, DL, TII->get(CmpOpc)) + .addReg(EnabledLanesMask) + .addImm(0); + if (LIS) + LIS->InsertMachineInstrInMaps(*IfZeroMask); + } else { + Register TestResultReg = MRI->createVirtualRegister(BoolRC); + IfZeroMask = BuildMI(MBB, I, DL, TII->get(AndOpc), TestResultReg) + .addReg(EnabledLanesMask) + .addImm(TestMask); + if (LIS) { + LIS->InsertMachineInstrInMaps(*IfZeroMask); + LIS->createAndComputeVirtRegInterval(TestResultReg); } } } - for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { - Register Reg = Register::index2VirtReg(i); - LiveVariables::VarInfo &VI = LV->getVarInfo(Reg); + MachineInstr *SetExecForSucc = + BuildMI(MBB, I, DL, TII->get(CmovOpc), Exec).addReg(EnabledLanesMask); - if (VI.AliveBlocks.test(MBB.getNumber())) - VI.AliveBlocks.set(SplitBB->getNumber()); - else { - for (MachineInstr *Kill : VI.Kills) { - if (Kill->getParent() == SplitBB && !DefInOrigBlock.contains(Reg)) - 
VI.AliveBlocks.set(MBB.getNumber()); + MachineBasicBlock *FlowBB = MI.getOperand(2).getMBB(); + MachineBasicBlock *TargetBB = nullptr; + // determine target BBs + I = skipToUncondBrOrEnd(MBB, I); + if (I != MBB.end()) { + // skipToUncondBrOrEnd returns either unconditional branch or end() + TargetBB = I->getOperand(0).getMBB(); + I->getOperand(0).setMBB(FlowBB); + } else { + // assert(MBB.succ_size() == 2); + for (auto Succ : successors(&MBB)) { + if (Succ != FlowBB) { + TargetBB = Succ; + break; } } + I = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(FlowBB); + if (LIS) + LIS->InsertMachineInstrInMaps(*I); } + if (TargetBB) { + MachineInstr *NewBr = + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1)).addMBB(TargetBB); + if (LIS) + LIS->InsertMachineInstrInMaps(*NewBr); } + if (!LIS) { + MI.eraseFromParent(); + return; } - LoweredEndCf.insert(NewMI); + LIS->ReplaceMachineInstrInMaps(MI, *SetExecForSucc); + RecomputeRegs.insert(MI.getOperand(0).getReg()); + RecomputeRegs.insert(MI.getOperand(1).getReg()); + MI.eraseFromParent(); + LIS->removeAllRegUnitsForPhysReg(Exec); +} +void SILowerControlFlow::emitWaveReconverge(MachineInstr &MI) { + MachineBasicBlock &BB = *MI.getParent(); + Register Mask = MI.getOperand(0).getReg(); + MachineInstr *ExecRestore = + BuildMI(BB, MI, MI.getDebugLoc(), TII->get(OrTermrOpc), Exec) + .addReg(Exec) + .addReg(Mask); + if (LV) + LV->replaceKillInstruction(Mask, MI, *ExecRestore); if (LIS) - LIS->ReplaceMachineInstrInMaps(MI, *NewMI); + LIS->ReplaceMachineInstrInMaps(MI, *ExecRestore); + LoweredWaveReconverge.insert(ExecRestore); MI.eraseFromParent(); - if (LIS) - LIS->handleMove(*NewMI); - return SplitBB; } // Returns replace operands for a logical operation, either single result @@ -615,38 +560,17 @@ void SILowerControlFlow::combineMasks(MachineInstr &MI) { MRI->getUniqueVRegDef(Reg)->eraseFromParent(); } -void SILowerControlFlow::optimizeEndCf() { - // If the only instruction immediately following this END_CF is another 
- // END_CF in the only successor we can avoid emitting exec mask restore here. - if (!EnableOptimizeEndCf) - return; - for (MachineInstr *MI : reverse(LoweredEndCf)) { - MachineBasicBlock &MBB = *MI->getParent(); - auto Next = - skipIgnoreExecInstsTrivialSucc(MBB, std::next(MI->getIterator())); - if (Next == MBB.end() || !LoweredEndCf.count(&*Next)) - continue; - // Only skip inner END_CF if outer ENDCF belongs to SI_IF. - // If that belongs to SI_ELSE then saved mask has an inverted value. - Register SavedExec - = TII->getNamedOperand(*Next, AMDGPU::OpName::src1)->getReg(); - assert(SavedExec.isVirtual() && "Expected saved exec to be src1!"); - - const MachineInstr *Def = MRI->getUniqueVRegDef(SavedExec); - if (Def && LoweredIf.count(SavedExec)) { - LLVM_DEBUG(dbgs() << "Skip redundant "; MI->dump()); - if (LIS) - LIS->RemoveMachineInstrFromMaps(*MI); - Register Reg; - if (LV) - Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg(); - MI->eraseFromParent(); - if (LV) - LV->recomputeForSingleDefVirtReg(Reg); - removeMBBifRedundant(MBB); +bool SILowerControlFlow::needMaskWithExec(const MachineInstr &MI) const { + bool SkipAnding = false; + if (MI.getOperand(1).isReg()) { + + if (MachineInstr *Def = MRI->getUniqueVRegDef(MI.getOperand(1).getReg())) { + SkipAnding = + Def->getParent() == MI.getParent() && SIInstrInfo::isVALU(*Def); } } + return !SkipAnding; } MachineBasicBlock *SILowerControlFlow::process(MachineInstr &MI) { @@ -674,11 +598,11 @@ MachineBasicBlock *SILowerControlFlow::process(MachineInstr &MI) { break; case AMDGPU::SI_WATERFALL_LOOP: - MI.setDesc(TII->get(AMDGPU::S_CBRANCH_EXECNZ)); + emitWaterfallLoop(MI); break; - case AMDGPU::SI_END_CF: - SplitBB = emitEndCf(MI); + case AMDGPU::SI_WAVE_RECONVERGE: + emitWaveReconverge(MI); break; default: @@ -707,52 +631,6 @@ MachineBasicBlock *SILowerControlFlow::process(MachineInstr &MI) { return SplitBB; } -bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) { - for (auto &I : 
MBB.instrs()) { - if (!I.isDebugInstr() && !I.isUnconditionalBranch()) - return false; - } - - assert(MBB.succ_size() == 1 && "MBB has more than one successor"); - - MachineBasicBlock *Succ = *MBB.succ_begin(); - MachineBasicBlock *FallThrough = nullptr; - - while (!MBB.predecessors().empty()) { - MachineBasicBlock *P = *MBB.pred_begin(); - if (P->getFallThrough(false) == &MBB) - FallThrough = P; - P->ReplaceUsesOfBlockWith(&MBB, Succ); - } - MBB.removeSuccessor(Succ); - if (LIS) { - for (auto &I : MBB.instrs()) - LIS->RemoveMachineInstrFromMaps(I); - } - if (MDT) { - // If Succ, the single successor of MBB, is dominated by MBB, MDT needs - // updating by changing Succ's idom to the one of MBB; otherwise, MBB must - // be a leaf node in MDT and could be erased directly. - if (MDT->dominates(&MBB, Succ)) - MDT->changeImmediateDominator(MDT->getNode(Succ), - MDT->getNode(&MBB)->getIDom()); - MDT->eraseNode(&MBB); - } - MBB.clear(); - MBB.eraseFromParent(); - if (FallThrough && !FallThrough->isLayoutSuccessor(Succ)) { - // Note: we cannot update block layout and preserve live intervals; - // hence we must insert a branch. 
- MachineInstr *BranchMI = BuildMI(*FallThrough, FallThrough->end(), - FallThrough->findBranchDebugLoc(), TII->get(AMDGPU::S_BRANCH)) - .addMBB(Succ); - if (LIS) - LIS->InsertMachineInstrInMaps(*BranchMI); - } - - return true; -} - bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget(); TII = ST.getInstrInfo(); @@ -772,7 +650,13 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { BoolRC = TRI->getBoolRC(); if (ST.isWave32()) { + TestMask = 0xffffffff; + Select = AMDGPU::S_CSELECT_B32_term; + CmovOpc = AMDGPU::S_CMOV_B32_term; + CmpOpc = AMDGPU::S_CMP_LG_U32_term; AndOpc = AMDGPU::S_AND_B32; + AndTermOpc = AMDGPU::S_AND_B32_term; + Andn2Opc = AMDGPU::S_ANDN2_B32; OrOpc = AMDGPU::S_OR_B32; XorOpc = AMDGPU::S_XOR_B32; MovTermOpc = AMDGPU::S_MOV_B32_term; @@ -782,7 +666,13 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32; Exec = AMDGPU::EXEC_LO; } else { + TestMask = 0xffffffffffffffff; + Select = AMDGPU::S_CSELECT_B64_term; + CmovOpc = AMDGPU::S_CMOV_B64_term; + CmpOpc = AMDGPU::S_CMP_LG_U64_term; AndOpc = AMDGPU::S_AND_B64; + AndTermOpc = AMDGPU::S_AND_B64_term; + Andn2Opc = AMDGPU::S_ANDN2_B64; OrOpc = AMDGPU::S_OR_B64; XorOpc = AMDGPU::S_XOR_B64; MovTermOpc = AMDGPU::S_MOV_B64_term; @@ -835,7 +725,7 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { case AMDGPU::SI_IF_BREAK: case AMDGPU::SI_WATERFALL_LOOP: case AMDGPU::SI_LOOP: - case AMDGPU::SI_END_CF: + case AMDGPU::SI_WAVE_RECONVERGE: SplitMBB = process(MI); Changed = true; break; @@ -848,8 +738,6 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { } } - optimizeEndCf(); - if (LIS) { for (Register Reg : RecomputeRegs) { LIS->removeInterval(Reg); @@ -858,8 +746,8 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { } RecomputeRegs.clear(); - LoweredEndCf.clear(); LoweredIf.clear(); + LoweredWaveReconverge.clear(); 
KillBlocks.clear(); return Changed; diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp index 93b70fa4ba974c..ac762339eb5131 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp @@ -217,6 +217,42 @@ bool SIOptimizeExecMasking::removeTerminatorBit(MachineInstr &MI) const { MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B32)); return true; } + case AMDGPU::S_CMOV_B64_term: { + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(TII->get(AMDGPU::S_CMOV_B64)); + return true; + } + case AMDGPU::S_CMP_LG_U64_term: { + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(TII->get(AMDGPU::S_CMP_LG_U64)); + return true; + } + case AMDGPU::S_CMOV_B32_term: { + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(TII->get(AMDGPU::S_CMOV_B32)); + return true; + } + case AMDGPU::S_CMP_LG_U32_term: { + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(TII->get(AMDGPU::S_CMP_LG_U32)); + return true; + } + case AMDGPU::S_CSELECT_B32_term: { + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(TII->get(AMDGPU::S_CSELECT_B32)); + return true; + } + case AMDGPU::S_CSELECT_B64_term: { + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(TII->get(AMDGPU::S_CSELECT_B64)); + return true; + } case AMDGPU::S_MOV_B64_term: { bool RegSrc = MI.getOperand(1).isReg(); MI.setDesc(TII->get(RegSrc ? 
AMDGPU::COPY : AMDGPU::S_MOV_B64)); diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp index 50f536c532afc7..e9afdaa1012e0a 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp @@ -679,7 +679,7 @@ bool SIOptimizeVGPRLiveRange::runOnMachineFunction(MachineFunction &MF) { for (auto Reg : CandidateRegs) optimizeLiveRange(Reg, &MBB, IfTarget, Endif, ElseBlocks); } else if (MI.getOpcode() == AMDGPU::SI_WATERFALL_LOOP) { - auto *LoopHeader = MI.getOperand(0).getMBB(); + auto *LoopHeader = MI.getOperand(2).getMBB(); auto *LoopEnd = &MBB; LLVM_DEBUG(dbgs() << "Checking Waterfall loop: " diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir index ce00edf3363f77..a0da03c177a1de 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir @@ -4,16 +4,16 @@ # CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x) # CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt) # CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_XOR %{{[0-9]*}}:_, %{{[0-9]*}}:_ -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) # CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.1 # CHECK: DIVERGENT: G_BR %bb.2 # CHECK-LABEL: 
BLOCK bb.1 # CHECK-LABEL: BLOCK bb.2 # CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_PHI %{{[0-9]*}}:_(s32), %bb.1, %{{[0-9]*}}:_(s32), %bb.0 # CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_PHI %{{[0-9]*}}:_(s1), %bb.1, %{{[0-9]*}}:_(s1), %bb.0 -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) # CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.3 # CHECK: DIVERGENT: G_BR %bb.4 # CHECK-LABEL: BLOCK bb.3 @@ -44,7 +44,7 @@ body: | %14:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x) %16:_(s1) = G_ICMP intpred(slt), %14(s32), %15 %18:_(s1) = G_XOR %16, %17 - %19:_(s1), %20:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %16(s1) + %19:_(s1), %20:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %16(s1) G_BRCOND %19(s1), %bb.2 G_BR %bb.3 @@ -54,14 +54,14 @@ body: | %21:_(s32) = G_EXTRACT_VECTOR_ELT %9(<3 x s32>), %15(s32) %22:_(s32) = G_EXTRACT_VECTOR_ELT %9(<3 x s32>), %23(s32) %24:_(s1) = G_ICMP intpred(slt), %21(s32), %15 + G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %20(s64) bb.3: successors: %bb.4(0x40000000), %bb.5(0x40000000) %25:_(s32) = G_PHI %22(s32), %bb.2, %33(s32), %bb.1 %26:_(s1) = G_PHI %24(s1), %bb.2, %18(s1), %bb.1 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %20(s64) - %27:_(s1), %28:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %26(s1) + %27:_(s1), %28:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %26(s1) G_BRCOND 
%27(s1), %bb.4 G_BR %bb.5 @@ -69,10 +69,10 @@ body: | successors: %bb.5(0x80000000) %29:_(s32) = G_EXTRACT_VECTOR_ELT %9(<3 x s32>), %30(s32) + G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %28(s64) bb.5: %31:_(s32) = G_PHI %25(s32), %bb.3, %29(s32), %bb.4 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %28(s64) G_STORE %31(s32), %32(p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1) S_ENDPGM 0 diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/temporal-divergence.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/temporal-divergence.mir index 7bff87c09b3c9f..261334354d8cc4 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/temporal-divergence.mir +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/temporal-divergence.mir @@ -27,15 +27,12 @@ body: | %11:_(s64) = G_PHI %12(s64), %bb.2, %15(s64), %bb.1 %18:_(s1) = G_CONSTANT i1 false - %12:sreg_64_xexec(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %18(s1), %11(s64) + %12:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %18(s1), %11(s64) ; CHECK: DIVERGENT: SI_LOOP SI_LOOP %12(s64), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec G_BR %bb.3 bb.3: - ; CHECK: DIVERGENT: %{{[0-9]+}}: %{{[0-9]+}}:_(s64) = G_PHI - %14:_(s64) = G_PHI %12(s64), %bb.2 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %14(s64) S_ENDPGM 0 ... 
@@ -82,15 +79,12 @@ body: | successors: %bb.5, %bb.4 %15:_(s64) = G_PHI %24(s64), %bb.2, %16(s64), %bb.4 - %16:sreg_64_xexec(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %14(s1), %15(s64) + %16:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %14(s1), %15(s64) ; CHECK: DIVERGENT: SI_LOOP SI_LOOP %16(s64), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec G_BR %bb.5 bb.5: - ; CHECK: DIVERGENT: %{{[0-9]+}}: %{{[0-9]+}}:_(s64) = G_PHI - %18:_(s64) = G_PHI %16(s64), %bb.4 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %18(s64) G_BR %bb.3 bb.6: @@ -140,15 +134,12 @@ body: | successors: %bb.5, %bb.4 %15:_(s64) = G_PHI %24(s64), %bb.2, %16(s64), %bb.4 - %16:sreg_64_xexec(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %14(s1), %15(s64) + %16:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %14(s1), %15(s64) ; CHECK: DIVERGENT: SI_LOOP SI_LOOP %16(s64), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec G_BR %bb.5 bb.5: - ; CHECK: DIVERGENT: %{{[0-9]+}}: %{{[0-9]+}}:_(s64) = G_PHI - %18:_(s64) = G_PHI %16(s64), %bb.4 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %18(s64) G_BR %bb.3 bb.6: @@ -191,17 +182,14 @@ body: | %15:_(s64) = G_PHI %25(s64), %bb.2, %16(s64), %bb.3 %24:_(s1) = G_CONSTANT i1 false - %16:sreg_64_xexec(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %24(s1), %15(s64) + %16:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %24(s1), %15(s64) ; CHECK: DIVERGENT: SI_LOOP SI_LOOP %16(s64), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec G_BR %bb.4 bb.4: - ; CHECK: DIVERGENT: %{{[0-9]+}}: %{{[0-9]+}}:_(s64) = G_PHI successors: %bb.5, %bb.2 - %18:_(s64) = G_PHI %16(s64), %bb.3 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %18(s64) G_BRCOND %13(s1), %bb.2 G_BR %bb.5 @@ -241,7 +229,7 @@ body: | bb.2: %15:_(s64) = G_PHI %16(s64), %bb.4, %19(s64), %bb.1 %24:_(s1) = G_CONSTANT i1 true - 
%16:sreg_64_xexec(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %24(s1), %15(s64) + %16:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %24(s1), %15(s64) bb.3: successors: %bb.4, %bb.3 @@ -257,9 +245,6 @@ body: | G_BR %bb.5 bb.5: - ; CHECK: DIVERGENT: %{{[0-9]+}}: %{{[0-9]+}}:_(s64) = G_PHI - %18:_(s64) = G_PHI %16(s64), %bb.4 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %18(s64) S_ENDPGM 0 ... @@ -291,16 +276,13 @@ body: | %10:_(s64) = G_PHI %11(s64), %bb.2, %19(s64), %bb.1 %24:_(s1) = G_CONSTANT i1 false - %11:sreg_64_xexec(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %24(s1), %10(s64) + %11:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %24(s1), %10(s64) ; CHECK: DIVERGENT: SI_LOOP SI_LOOP %11(s64), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec G_BR %bb.3 bb.3: - ; CHECK: DIVERGENT: %{{[0-9]+}}: %{{[0-9]+}}:_(s64) = G_PHI ; CHECK-NOT: DIVERGENT: %{{[0-9]+}}: %{{[0-9]+}}:_(s64) = G_PHI - %13:_(s64) = G_PHI %11(s64), %bb.2 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %13(s64) %14:_(p4) = COPY %3(p4) %15:_(s64) = G_CONSTANT i64 40 %16:_(p4) = G_PTR_ADD %14, %15(s64) @@ -354,15 +336,12 @@ body: | %15:_(s64) = G_PHI %23(s64), %bb.2, %16(s64), %bb.3 %25:_(s1) = G_CONSTANT i1 false - %16:sreg_64_xexec(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %25(s1), %15(s64) + %16:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %25(s1), %15(s64) ; CHECK: DIVERGENT: SI_LOOP SI_LOOP %16(s64), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec G_BR %bb.4 bb.4: - ; CHECK: DIVERGENT: %{{[0-9]+}}: %{{[0-9]+}}:_(s64) = G_PHI - %18:_(s64) = G_PHI %16(s64), %bb.3 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %18(s64) bb.5: diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uses-value-from-cycle.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uses-value-from-cycle.mir index 
b7e0d5449d2e8b..dbd4eb5f1b2b54 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uses-value-from-cycle.mir +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uses-value-from-cycle.mir @@ -47,20 +47,18 @@ body: | %19:_(s32) = G_PHI %18(s32), %bb.7, %25(s32), %bb.4 %20:_(s32) = G_PHI %6(s32), %bb.7, %25(s32), %bb.4 %21:_(s1) = G_PHI %34(s1), %bb.7, %33(s1), %bb.4 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %16(s32) - %22:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %21(s1), %0(s32) + %22:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %21(s1), %0(s32) SI_LOOP %22(s32), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec G_BR %bb.6 bb.6: - %24:_(s32) = G_PHI %22(s32), %bb.5 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %24(s32) SI_RETURN bb.7: %34:_(s1) = G_CONSTANT i1 false %35:_(s32) = G_CONSTANT i32 1 %18:_(s32) = G_OR %2, %35 + G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %16(s32) G_BR %bb.5 ... 
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/deprecated/hidden-diverge.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/deprecated/hidden-diverge.mir index d1a61100a14cb8..8b7db292fa44e4 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/deprecated/hidden-diverge.mir +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/deprecated/hidden-diverge.mir @@ -47,13 +47,13 @@ body: | S_CMP_LT_I32 killed %24, killed %25, implicit-def $scc %26:sreg_64 = COPY $scc %4:sreg_64 = COPY %26 + SI_WAVE_RECONVERGE %2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2: successors: %bb.3(0x40000000), %bb.4(0x40000000) %5:sreg_32 = PHI %14, %bb.0, %3, %bb.1 %6:vreg_1 = PHI %1, %bb.0, %4, %bb.1 - SI_END_CF %2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec %27:sreg_64 = COPY %6 %7:sreg_64 = SI_IF %27, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.3 @@ -62,10 +62,10 @@ body: | successors: %bb.4(0x80000000) %8:sreg_32 = COPY %0.sub2 + SI_WAVE_RECONVERGE %7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.4: %9:vgpr_32 = PHI %5, %bb.2, %8, %bb.3 - SI_END_CF %7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec %28:sreg_64 = IMPLICIT_DEF %29:vreg_64 = COPY %28 GLOBAL_STORE_DWORD killed %29, %9, 0, 0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll index 61d2c854dffa55..e21ec40966462c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll @@ -343,10 +343,10 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB4_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: 
s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -387,10 +387,10 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -411,10 +411,10 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB4_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -435,10 +435,10 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB4_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; 
GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -489,11 +489,11 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB5_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: @@ -532,11 +532,11 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: @@ -555,11 +555,11 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB5_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: @@ -578,11 +578,11 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB5_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: @@ -624,11 +624,11 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB6_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -661,11 +661,11 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB6_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 
+; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -705,10 +705,10 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB6_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -731,10 +731,10 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB6_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -780,11 +780,11 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB7_1 +; GFX12-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -816,11 +816,11 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB7_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -858,11 +858,11 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB7_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -882,11 +882,11 @@ define void 
@global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB7_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -936,10 +936,10 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB8_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -980,10 +980,10 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1004,10 +1004,10 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: buffer_wbinvl1 ; 
GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB8_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -1028,10 +1028,10 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB8_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1078,11 +1078,11 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB9_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: @@ -1123,11 +1123,11 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 
s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: @@ -1146,11 +1146,11 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB9_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: @@ -1169,11 +1169,11 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB9_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: @@ -1212,11 +1212,11 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; 
GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB10_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -1249,11 +1249,11 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB10_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -1293,10 +1293,10 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB10_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 
s[30:31] @@ -1322,10 +1322,10 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB10_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -1366,11 +1366,11 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB11_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -1402,11 +1402,11 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB11_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; 
GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -1445,11 +1445,11 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB11_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -1472,11 +1472,11 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB11_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -1527,10 +1527,10 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB12_1 +; GFX940-NEXT: 
s_andn2_b64 s[6:7], exec, s[4:5] +; GFX940-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX940-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: @@ -1583,10 +1583,10 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 +; GFX90A-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX90A-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX90A-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: @@ -1614,10 +1614,10 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB12_1 +; GFX908-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX908-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX908-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: @@ -1645,10 +1645,10 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB12_1 +; GFX8-NEXT: s_andn2_b64 
s[10:11], exec, s[8:9] +; GFX8-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX8-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: @@ -1702,11 +1702,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX940-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB13_1 +; GFX940-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX940-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: @@ -1757,11 +1757,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX90A-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: @@ -1787,11 +1787,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; 
GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB13_1 +; GFX908-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX908-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: @@ -1817,11 +1817,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB13_1 +; GFX8-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX8-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: @@ -1869,11 +1869,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB14_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX12-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX12-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -1894,6 +1894,7 @@ define double 
@buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], 0 offen ; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1911,11 +1912,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX11-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX11-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -1975,10 +1976,10 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB14_1 +; GFX908-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX908-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX908-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -2010,10 +2011,10 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 
v[0:1], v[9:10] ; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB14_1 +; GFX8-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX8-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX8-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -2060,11 +2061,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] ; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB15_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX12-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX12-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -2084,6 +2085,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], 0 offen +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2100,11 +2102,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] ; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX11-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX11-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -2161,11 +2163,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] ; GFX908-NEXT: v_mov_b32_e32 v2, v7 ; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX908-NEXT: v_mov_b32_e32 v3, v8 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB15_1 +; GFX908-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX908-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -2194,11 +2196,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, v7 ; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v3, v8 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB15_1 +; GFX8-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX8-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll index 83be67a9138f6e..5e2c2b5aa9a5ba 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll @@ -343,10 +343,10 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB4_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -387,10 +387,10 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -411,10 +411,10 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB4_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: 
s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -435,10 +435,10 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB4_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -489,11 +489,11 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB5_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: @@ -532,11 +532,11 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: 
global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: @@ -555,11 +555,11 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB5_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: @@ -578,11 +578,11 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB5_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: @@ -624,11 +624,11 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB6_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: 
s_cbranch_scc1 .LBB6_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -661,11 +661,11 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB6_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -705,10 +705,10 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB6_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -731,10 +731,10 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB6_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: 
s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -780,11 +780,11 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB7_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -816,11 +816,11 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB7_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -858,11 +858,11 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; 
GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB7_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -882,11 +882,11 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB7_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -936,10 +936,10 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB8_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -980,10 +980,10 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: 
buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1004,10 +1004,10 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB8_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -1028,10 +1028,10 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB8_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1078,11 +1078,11 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; 
GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB9_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: @@ -1123,11 +1123,11 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: @@ -1146,11 +1146,11 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB9_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: @@ -1169,11 +1169,11 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 
s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB9_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: @@ -1212,11 +1212,11 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB10_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -1249,11 +1249,11 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB10_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -1293,10 +1293,10 @@ 
define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB10_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -1322,10 +1322,10 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB10_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -1366,11 +1366,11 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB11_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -1402,11 +1402,11 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB11_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -1445,11 +1445,11 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB11_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -1472,11 +1472,11 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz 
.LBB11_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -1527,10 +1527,10 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB12_1 +; GFX940-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX940-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX940-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: @@ -1583,10 +1583,10 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 +; GFX90A-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX90A-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX90A-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: @@ -1614,10 +1614,10 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB12_1 +; GFX908-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; 
GFX908-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX908-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: @@ -1645,10 +1645,10 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB12_1 +; GFX8-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX8-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX8-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: @@ -1702,11 +1702,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX940-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB13_1 +; GFX940-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX940-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: @@ -1757,11 +1757,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, 
s[8:9] -; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX90A-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: @@ -1787,11 +1787,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB13_1 +; GFX908-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX908-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: @@ -1817,11 +1817,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB13_1 +; GFX8-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX8-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: @@ -1869,11 +1869,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB14_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX12-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX12-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -1894,6 +1894,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], 0 offen ; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1911,11 +1912,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX11-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX11-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -1975,10 +1976,10 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: 
s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB14_1 +; GFX908-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX908-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX908-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -2010,10 +2011,10 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB14_1 +; GFX8-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX8-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX8-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -2060,11 +2061,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] ; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB15_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX12-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX12-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -2084,6 +2085,7 @@ define void 
@buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], 0 offen +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2100,11 +2102,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] ; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX11-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX11-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -2161,11 +2163,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] ; GFX908-NEXT: v_mov_b32_e32 v2, v7 ; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX908-NEXT: v_mov_b32_e32 v3, v8 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX908-NEXT: s_cbranch_execnz .LBB15_1 +; GFX908-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX908-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -2194,11 +2196,11 @@ define void 
@buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, v7 ; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v3, v8 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB15_1 +; GFX8-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX8-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll index 966a481b6594dc..2a8d1b2bf56983 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll @@ -117,10 +117,10 @@ define void @divergent_i1_phi_used_inside_loop(float %val, ptr %addr) { ; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo ; GFX10-NEXT: s_and_b32 s4, exec_lo, s4 ; GFX10-NEXT: s_or_b32 s6, s6, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB2_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10-NEXT: ; %bb.2: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6 ; GFX10-NEXT: flat_store_dword v[1:2], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -147,24 +147,25 @@ define void @divergent_i1_phi_used_inside_loop_bigger_loop_body(float %val, floa ; GFX10-LABEL: divergent_i1_phi_used_inside_loop_bigger_loop_body: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s4, 0 -; 
GFX10-NEXT: v_cmp_lt_f32_e64 s5, 1.0, v1 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: v_cmp_lt_f32_e64 s4, 1.0, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x3e8 -; GFX10-NEXT: v_mov_b32_e32 v8, s4 +; GFX10-NEXT: v_mov_b32_e32 v8, s5 ; GFX10-NEXT: ; implicit-def: $sgpr6 ; GFX10-NEXT: s_branch .LBB3_2 ; GFX10-NEXT: .LBB3_1: ; %loop_body ; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v9, v8 -; GFX10-NEXT: s_xor_b32 s5, s5, -1 +; GFX10-NEXT: s_xor_b32 s4, s4, -1 ; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v8 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v0 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo -; GFX10-NEXT: s_and_b32 s7, exec_lo, s5 +; GFX10-NEXT: s_and_b32 s7, exec_lo, s4 ; GFX10-NEXT: s_or_b32 s6, s6, s7 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execz .LBB3_6 +; GFX10-NEXT: s_andn2_b32 s7, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s7, s5 +; GFX10-NEXT: s_cbranch_scc0 .LBB3_6 ; GFX10-NEXT: .LBB3_2: ; %loop_start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_cmp_ge_i32_e32 vcc_lo, 0x3e8, v8 @@ -185,7 +186,6 @@ define void @divergent_i1_phi_used_inside_loop_bigger_loop_body(float %val, floa ; GFX10-NEXT: flat_store_dword v[4:5], v1 ; GFX10-NEXT: s_branch .LBB3_1 ; GFX10-NEXT: .LBB3_6: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6 ; GFX10-NEXT: flat_store_dword v[2:3], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir index 6594d7f5042123..b0738eabb43044 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir @@ -206,7 +206,7 @@ body: | ; 
GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %22(s1), %bb.1 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %21(s1), %bb.1 ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0 ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1 ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C]](s1), %bb.0, %11(s1), %bb.1 @@ -226,9 +226,7 @@ body: | ; GFX10-NEXT: G_BR %bb.2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.2: - ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.1 ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32) ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY5]](s1), [[C5]], [[C4]] @@ -263,8 +261,6 @@ body: | bb.2: %16:_(s1) = G_PHI %11(s1), %bb.1 - %17:_(s32) = G_PHI %7(s32), %bb.1 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %17(s32) %18:_(s32) = G_FCONSTANT float 0.000000e+00 %19:_(s32) = G_FCONSTANT float 1.000000e+00 %20:_(s32) = G_SELECT %16(s1), %19, %18 @@ -302,8 +298,8 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %42(s1), %bb.5 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[COPY8]](s1), %bb.0, %39(s1), %bb.5 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %41(s1), %bb.5 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[COPY8]](s1), %bb.0, %38(s1), %bb.5 ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.5, [[C]](s32), %bb.0 ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %17(s32), %bb.5 ; 
GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) @@ -357,9 +353,7 @@ body: | ; GFX10-NEXT: G_BR %bb.6 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.6: - ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.5 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32) ; GFX10-NEXT: [[C10:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 ; GFX10-NEXT: [[C11:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY13]](s1), [[C11]], [[C10]] @@ -435,8 +429,6 @@ body: | bb.6: %33:_(s1) = G_PHI %19(s1), %bb.5 - %34:_(s32) = G_PHI %15(s32), %bb.5 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %34(s32) %35:_(s32) = G_FCONSTANT float 0.000000e+00 %36:_(s32) = G_FCONSTANT float 1.000000e+00 %37:_(s32) = G_SELECT %33(s1), %36, %35 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll index 49c232661c6dc1..f44dd101cd1d94 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll @@ -33,11 +33,11 @@ define void @divergent_i1_phi_used_outside_loop(float %val, float %pre.cond.val, ; GFX10-NEXT: s_and_b32 s6, exec_lo, s6 ; GFX10-NEXT: s_or_b32 s7, s8, s7 ; GFX10-NEXT: s_or_b32 s5, s5, s6 +; GFX10-NEXT: s_andn2_b32 s8, exec_lo, s4 ; GFX10-NEXT: s_mov_b32 s6, s7 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB0_1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s8, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX10-NEXT: ; %bb.2: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s5 ; GFX10-NEXT: flat_store_dword v[2:3], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -66,42 +66,44 @@ define void 
@divergent_i1_phi_used_outside_loop_larger_loop_body(float %val, ptr ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s4, -1 -; GFX10-NEXT: ; implicit-def: $sgpr6 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: s_andn2_b32 s5, s4, exec_lo ; GFX10-NEXT: s_and_b32 s4, exec_lo, -1 -; GFX10-NEXT: s_or_b32 s4, s5, s4 +; GFX10-NEXT: s_or_b32 s7, s5, s4 +; GFX10-NEXT: ; implicit-def: $sgpr5 ; GFX10-NEXT: s_branch .LBB1_2 ; GFX10-NEXT: .LBB1_1: ; %loop.cond ; GFX10-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v0 ; GFX10-NEXT: v_add_co_u32 v1, s4, v1, 4 ; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s4, 0, v2, s4 ; GFX10-NEXT: v_cmp_le_i32_e32 vcc_lo, 10, v0 -; GFX10-NEXT: s_andn2_b32 s7, s5, exec_lo -; GFX10-NEXT: s_and_b32 s8, exec_lo, s6 -; GFX10-NEXT: s_or_b32 s4, s7, s8 +; GFX10-NEXT: s_andn2_b32 s7, s6, exec_lo +; GFX10-NEXT: s_and_b32 s8, exec_lo, s5 +; GFX10-NEXT: s_or_b32 s7, s7, s8 ; GFX10-NEXT: s_cbranch_vccz .LBB1_4 ; GFX10-NEXT: .LBB1_2: ; %loop.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_mov_b32 s5, s4 -; GFX10-NEXT: s_andn2_b32 s4, s6, exec_lo -; GFX10-NEXT: s_and_b32 s6, exec_lo, s5 -; GFX10-NEXT: s_or_b32 s6, s4, s6 -; GFX10-NEXT: s_and_saveexec_b32 s4, s5 -; GFX10-NEXT: s_cbranch_execz .LBB1_1 +; GFX10-NEXT: s_mov_b32 s6, s7 +; GFX10-NEXT: s_andn2_b32 s5, s5, exec_lo +; GFX10-NEXT: s_and_b32 s7, exec_lo, s7 +; GFX10-NEXT: s_mov_b32 s4, exec_lo +; GFX10-NEXT: s_or_b32 s5, s5, s7 +; GFX10-NEXT: s_and_b32 s7, s6, exec_lo +; GFX10-NEXT: s_cmov_b32 exec_lo, s7 +; GFX10-NEXT: s_cbranch_scc0 .LBB1_1 ; GFX10-NEXT: ; %bb.3: ; %is.eq.zero ; GFX10-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GFX10-NEXT: global_load_dword v5, v[1:2], off -; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo +; GFX10-NEXT: s_andn2_b32 s5, s5, exec_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v5 ; GFX10-NEXT: s_and_b32 
s7, exec_lo, vcc_lo -; GFX10-NEXT: s_or_b32 s6, s6, s7 +; GFX10-NEXT: s_or_b32 s5, s5, s7 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_branch .LBB1_1 ; GFX10-NEXT: .LBB1_4: ; %exit -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6 ; GFX10-NEXT: flat_store_dword v[3:4], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -151,10 +153,10 @@ define void @divergent_i1_xor_used_outside_loop(float %val, float %pre.cond.val, ; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo ; GFX10-NEXT: s_and_b32 s7, exec_lo, s5 ; GFX10-NEXT: s_or_b32 s6, s6, s7 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB2_1 +; GFX10-NEXT: s_andn2_b32 s7, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s7, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10-NEXT: ; %bb.2: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6 ; GFX10-NEXT: flat_store_dword v[2:3], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -192,10 +194,12 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts, ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: s_mov_b32 s4, exec_lo ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, -1 -; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB3_6 +; GFX10-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB3_6 ; GFX10-NEXT: ; %bb.1: ; %loop.start.preheader ; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: ; implicit-def: $sgpr6 @@ -204,31 +208,33 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts, ; GFX10-NEXT: s_branch .LBB3_3 ; GFX10-NEXT: .LBB3_2: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX10-NEXT: s_xor_b32 s9, s8, -1 ; GFX10-NEXT: s_and_b32 
s10, exec_lo, s7 ; GFX10-NEXT: s_or_b32 s5, s10, s5 ; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo ; GFX10-NEXT: s_and_b32 s9, exec_lo, s9 ; GFX10-NEXT: s_or_b32 s6, s6, s9 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execz .LBB3_5 +; GFX10-NEXT: s_andn2_b32 s9, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s9, s5 +; GFX10-NEXT: s_cbranch_scc0 .LBB3_5 ; GFX10-NEXT: .LBB3_3: ; %loop.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX10-NEXT: s_andn2_b32 s8, s8, exec_lo -; GFX10-NEXT: s_and_b32 s9, exec_lo, -1 +; GFX10-NEXT: s_and_b32 s10, exec_lo, -1 ; GFX10-NEXT: s_andn2_b32 s7, s7, exec_lo -; GFX10-NEXT: s_or_b32 s8, s8, s9 +; GFX10-NEXT: s_or_b32 s8, s8, s10 ; GFX10-NEXT: v_lshlrev_b64 v[6:7], 2, v[5:6] -; GFX10-NEXT: s_or_b32 s7, s7, s9 +; GFX10-NEXT: s_or_b32 s7, s7, s10 +; GFX10-NEXT: s_mov_b32 s9, exec_lo ; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v1, v6 ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v2, v7, vcc_lo ; GFX10-NEXT: global_load_dword v6, v[6:7], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 -; GFX10-NEXT: s_and_saveexec_b32 s9, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB3_2 +; GFX10-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX10-NEXT: ; %bb.4: ; %loop.cond ; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1 ; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v5 @@ -240,22 +246,24 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts, ; GFX10-NEXT: s_and_b32 s11, exec_lo, vcc_lo ; GFX10-NEXT: s_or_b32 s8, s8, s10 ; GFX10-NEXT: s_or_b32 s7, s7, s11 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX10-NEXT: s_branch .LBB3_2 ; GFX10-NEXT: .LBB3_5: ; %loop.exit.guard -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_andn2_b32 s5, -1, exec_lo ; GFX10-NEXT: s_and_b32 s6, exec_lo, s6 ; GFX10-NEXT: s_or_b32 s6, s5, s6 -; GFX10-NEXT: .LBB3_6: ; %Flow1 ; 
GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_and_saveexec_b32 s4, s6 -; GFX10-NEXT: s_cbranch_execz .LBB3_8 +; GFX10-NEXT: .LBB3_6: ; %Flow1 +; GFX10-NEXT: s_mov_b32 s4, exec_lo +; GFX10-NEXT: s_and_b32 s5, s6, exec_lo +; GFX10-NEXT: s_cmov_b32 exec_lo, s5 +; GFX10-NEXT: s_cbranch_scc0 .LBB3_8 ; GFX10-NEXT: ; %bb.7: ; %block.after.loop ; GFX10-NEXT: v_mov_b32_e32 v0, 5 ; GFX10-NEXT: flat_store_dword v[3:4], v0 -; GFX10-NEXT: .LBB3_8: ; %exit ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: .LBB3_8: ; %exit ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: @@ -302,20 +310,22 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace ; GFX10-NEXT: s_branch .LBB4_2 ; GFX10-NEXT: .LBB4_1: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; GFX10-NEXT: s_and_b32 s4, exec_lo, s7 +; GFX10-NEXT: s_and_b32 s4, exec_lo, s8 ; GFX10-NEXT: s_or_b32 s5, s4, s5 ; GFX10-NEXT: s_andn2_b32 s4, s6, exec_lo ; GFX10-NEXT: s_and_b32 s6, exec_lo, vcc_lo ; GFX10-NEXT: s_or_b32 s6, s4, s6 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execz .LBB4_6 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX10-NEXT: .LBB4_2: ; %cond.block.0 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_mov_b32_e32 v4, v5 +; GFX10-NEXT: s_mov_b32 s7, exec_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 -; GFX10-NEXT: s_and_saveexec_b32 s7, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB4_4 +; GFX10-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB4_4 ; GFX10-NEXT: ; %bb.3: ; %if.block.0 ; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1 ; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v4 @@ -323,31 +333,35 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace ; 
GFX10-NEXT: v_add_co_u32 v8, s4, v2, v8 ; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v3, v9, s4 ; GFX10-NEXT: global_store_dword v[8:9], v4, off -; GFX10-NEXT: .LBB4_4: ; %loop.break.block -; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; GFX10-NEXT: .LBB4_4: ; %loop.break.block +; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1 ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, v1, v4 -; GFX10-NEXT: s_mov_b32 s7, -1 +; GFX10-NEXT: s_mov_b32 s7, exec_lo +; GFX10-NEXT: s_mov_b32 s8, -1 ; GFX10-NEXT: ; implicit-def: $vgpr5 -; GFX10-NEXT: s_and_saveexec_b32 s8, s4 -; GFX10-NEXT: s_cbranch_execz .LBB4_1 +; GFX10-NEXT: s_cmp_lg_u32 s4, 0 +; GFX10-NEXT: s_cmov_b32 exec_lo, s4 +; GFX10-NEXT: s_cbranch_scc0 .LBB4_1 ; GFX10-NEXT: ; %bb.5: ; %loop.cond ; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1 ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v4 ; GFX10-NEXT: s_andn2_b32 s4, -1, exec_lo -; GFX10-NEXT: s_and_b32 s7, exec_lo, 0 -; GFX10-NEXT: s_or_b32 s7, s4, s7 +; GFX10-NEXT: s_and_b32 s8, exec_lo, 0 +; GFX10-NEXT: s_or_b32 s8, s4, s8 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s7 ; GFX10-NEXT: s_branch .LBB4_1 ; GFX10-NEXT: .LBB4_6: ; %cond.block.1 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_and_saveexec_b32 s4, s6 -; GFX10-NEXT: s_cbranch_execz .LBB4_8 +; GFX10-NEXT: s_mov_b32 s4, exec_lo +; GFX10-NEXT: s_and_b32 s5, s6, exec_lo +; GFX10-NEXT: s_cmov_b32 exec_lo, s5 +; GFX10-NEXT: s_cbranch_scc0 .LBB4_8 ; GFX10-NEXT: ; %bb.7: ; %if.block.1 ; GFX10-NEXT: global_store_dword v[6:7], v4, off -; GFX10-NEXT: .LBB4_8: ; %exit ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: .LBB4_8: ; %exit ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: br label %loop.start @@ -413,7 +427,6 @@ define amdgpu_ps void @divergent_i1_freeze_used_outside_loop(i32 %n, ptr addrspa ; GFX10-NEXT: s_branch .LBB5_2 ; GFX10-NEXT: .LBB5_1: ; %loop.cond ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1 -; 
GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_cmp_lt_i32_e32 vcc_lo, v5, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v5 ; GFX10-NEXT: s_or_b32 s0, vcc_lo, s0 @@ -422,15 +435,18 @@ define amdgpu_ps void @divergent_i1_freeze_used_outside_loop(i32 %n, ptr addrspa ; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo ; GFX10-NEXT: s_or_b32 s3, s3, s4 ; GFX10-NEXT: s_or_b32 s1, s1, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: s_cbranch_execz .LBB5_4 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s0 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s0 +; GFX10-NEXT: s_cbranch_scc0 .LBB5_4 ; GFX10-NEXT: .LBB5_2: ; %loop.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo -; GFX10-NEXT: s_and_b32 s4, exec_lo, s3 -; GFX10-NEXT: s_or_b32 s2, s2, s4 -; GFX10-NEXT: s_and_saveexec_b32 s4, s3 -; GFX10-NEXT: s_cbranch_execz .LBB5_1 +; GFX10-NEXT: s_and_b32 s5, exec_lo, s3 +; GFX10-NEXT: s_mov_b32 s4, exec_lo +; GFX10-NEXT: s_or_b32 s2, s2, s5 +; GFX10-NEXT: s_and_b32 s5, s3, exec_lo +; GFX10-NEXT: s_cmov_b32 exec_lo, s5 +; GFX10-NEXT: s_cbranch_scc0 .LBB5_1 ; GFX10-NEXT: ; %bb.3: ; %is.eq.zero ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1 ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 @@ -444,9 +460,9 @@ define amdgpu_ps void @divergent_i1_freeze_used_outside_loop(i32 %n, ptr addrspa ; GFX10-NEXT: s_and_b32 s3, exec_lo, vcc_lo ; GFX10-NEXT: s_or_b32 s2, s2, s3 ; GFX10-NEXT: ; implicit-def: $sgpr3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_branch .LBB5_1 ; GFX10-NEXT: .LBB5_4: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s1 ; GFX10-NEXT: flat_store_dword v[3:4], v0 ; GFX10-NEXT: s_endpgm @@ -489,31 +505,33 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a ; GFX10-NEXT: s_branch .LBB6_2 ; GFX10-NEXT: .LBB6_1: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, 
exec_lo, s4 ; GFX10-NEXT: s_and_b32 s4, exec_lo, s2 ; GFX10-NEXT: s_or_b32 s0, s4, s0 ; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo ; GFX10-NEXT: s_and_b32 s4, exec_lo, s3 ; GFX10-NEXT: s_or_b32 s1, s1, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: s_cbranch_execz .LBB6_4 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s0 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s0 +; GFX10-NEXT: s_cbranch_scc0 .LBB6_4 ; GFX10-NEXT: .LBB6_2: ; %A ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GFX10-NEXT: s_andn2_b32 s3, s3, exec_lo -; GFX10-NEXT: s_and_b32 s4, exec_lo, -1 +; GFX10-NEXT: s_and_b32 s5, exec_lo, -1 ; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo -; GFX10-NEXT: s_or_b32 s3, s3, s4 +; GFX10-NEXT: s_or_b32 s3, s3, s5 ; GFX10-NEXT: v_lshlrev_b64 v[7:8], 2, v[6:7] -; GFX10-NEXT: s_or_b32 s2, s2, s4 +; GFX10-NEXT: s_or_b32 s2, s2, s5 +; GFX10-NEXT: s_mov_b32 s4, exec_lo ; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v2, v7 ; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo ; GFX10-NEXT: global_load_dword v9, v[9:10], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9 -; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB6_1 +; GFX10-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB6_1 ; GFX10-NEXT: ; %bb.3: ; %loop.body ; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1 ; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v0, v7 @@ -531,12 +549,15 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v9 ; GFX10-NEXT: global_store_dword v[7:8], v9, off +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_branch .LBB6_1 ; GFX10-NEXT: .LBB6_4: ; %loop.exit.guard -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: s_and_saveexec_b32 s0, s1 
-; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX10-NEXT: s_cbranch_execz .LBB6_6 +; GFX10-NEXT: s_and_b32 s0, s1, exec_lo +; GFX10-NEXT: s_xor_b32 s1, s0, exec_lo +; GFX10-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10-NEXT: s_cmov_b32 exec_lo, s0 +; GFX10-NEXT: s_cbranch_scc0 .LBB6_6 ; GFX10-NEXT: ; %bb.5: ; %break.body ; GFX10-NEXT: v_mov_b32_e32 v0, 10 ; GFX10-NEXT: global_store_dword v[4:5], v0, off diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir index 5bbe3e48868998..44852efb2315ed 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir @@ -30,8 +30,8 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %36(s1), %bb.1 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.0, %24(s1), %bb.1 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %35(s1), %bb.1 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.0, %23(s1), %bb.1 ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.1, [[C]](s32), %bb.0 ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %11(s32), %bb.1 ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) @@ -56,9 +56,7 @@ body: | ; GFX10-NEXT: G_BR %bb.2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.2: - ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.1 ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_2]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32) ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) 
= G_SELECT [[COPY11]](s1), [[C5]], [[C4]] @@ -95,8 +93,6 @@ body: | bb.2: %18:_(s1) = G_PHI %12(s1), %bb.1 - %19:_(s32) = G_PHI %9(s32), %bb.1 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %19(s32) %20:_(s32) = G_FCONSTANT float 0.000000e+00 %21:_(s32) = G_FCONSTANT float 1.000000e+00 %22:_(s32) = G_SELECT %18(s1), %21, %20 @@ -155,6 +151,7 @@ body: | ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[LOAD]](s32), [[C2]] ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1) + ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), [[SI_IF]](s32) ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc @@ -165,7 +162,6 @@ body: | ; GFX10-NEXT: [[PHI4:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_1]](s1), %bb.1, [[S_OR_B32_2]](s1), %bb.2 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[PHI4]](s1) ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY12]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[PHI3]], [[C3]](s64) ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 @@ -214,12 +210,12 @@ body: | %15:_(s32) = G_LOAD %10(p1) :: (load (s32), addrspace 1) %16:_(s32) = G_CONSTANT i32 0 %17:_(s1) = G_ICMP intpred(eq), %15(s32), %16 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %14(s32) bb.3: successors: %bb.4(0x04000000), %bb.1(0x7c000000) %13:_(s1) = G_PHI %17(s1), %bb.2, %12(s1), %bb.1 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %14(s32) %18:_(s64) = G_CONSTANT i64 4 
%11:_(p1) = G_PTR_ADD %10, %18(s64) %19:_(s32) = G_CONSTANT i32 1 @@ -262,8 +258,8 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %27(s1), %bb.1 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[COPY4]](s1), %bb.0, %24(s1), %bb.1 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %26(s1), %bb.1 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[COPY4]](s1), %bb.0, %23(s1), %bb.1 ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.1, [[C]](s32), %bb.0 ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %11(s32), %bb.1 ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) @@ -284,9 +280,7 @@ body: | ; GFX10-NEXT: G_BR %bb.2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.2: - ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.1 ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32) ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY9]](s1), [[C5]], [[C4]] @@ -323,8 +317,6 @@ body: | bb.2: %18:_(s1) = G_PHI %13(s1), %bb.1 - %19:_(s32) = G_PHI %9(s32), %bb.1 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %19(s32) %20:_(s32) = G_FCONSTANT float 0.000000e+00 %21:_(s32) = G_FCONSTANT float 1.000000e+00 %22:_(s32) = G_SELECT %18(s1), %21, %20 @@ -370,18 +362,17 @@ body: | ; GFX10-NEXT: bb.2: ; GFX10-NEXT: successors: %bb.5(0x40000000), %bb.6(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[COPY5]](s1), %bb.0, %40(s1), %bb.8 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[COPY5]](s1), %bb.0, %39(s1), %bb.8 ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY 
[[PHI]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) ; GFX10-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY7]](s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.5 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.3: ; GFX10-NEXT: successors: %bb.4(0x40000000), %bb.7(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF3]](s1), %bb.1, %73(s1), %bb.7 - ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[DEF2]](s1), %bb.1, %62(s1), %bb.7 - ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.1, %49(s1), %bb.7 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF3]](s1), %bb.1, %72(s1), %bb.7 + ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[DEF2]](s1), %bb.1, %61(s1), %bb.7 + ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.1, %48(s1), %bb.7 ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[C2]](s32), %bb.1, %17(s32), %bb.7 ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s32) = G_PHI %19(s32), %bb.7, [[C2]](s32), %bb.1 ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1) @@ -417,6 +408,7 @@ body: | ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI5]], [[C7]] ; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[PHI5]](s32), [[COPY]] ; GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1) + ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), [[SI_IF2]](s32) ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY13]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY15]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc @@ -430,9 +422,9 @@ body: | ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 ; GFX10-NEXT: G_STORE [[C8]](s32), [[MV1]](p0) :: (store (s32)) + ; 
GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), [[SI_IF1]](s32) ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.6: - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32) ; GFX10-NEXT: SI_RETURN ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.7: @@ -443,7 +435,6 @@ body: | ; GFX10-NEXT: [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.4, [[DEF]](s32), %bb.3 ; GFX10-NEXT: [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]](s1) ; GFX10-NEXT: [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF2]](s32) ; GFX10-NEXT: [[C9:%[0-9]+]]:_(s1) = G_CONSTANT i1 true ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[COPY18]], [[C9]] ; GFX10-NEXT: [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1) @@ -457,10 +448,9 @@ body: | ; GFX10-NEXT: bb.8: ; GFX10-NEXT: successors: %bb.2(0x80000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI9:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.7 ; GFX10-NEXT: [[COPY20:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_4]](s1) ; GFX10-NEXT: [[COPY21:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY20]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI9]](s32) + ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), [[SI_IF]](s32) ; GFX10-NEXT: [[S_ANDN2_B32_5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY21]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_5]](s1), [[S_AND_B32_5]](s1), implicit-def $scc @@ -493,7 +483,6 @@ body: | successors: %bb.5(0x40000000), %bb.6(0x40000000) %13:sreg_32_xm0_xexec(s1) = G_PHI %14(s1), %bb.8, %10(s1), %bb.0 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %11(s32) %15:sreg_32_xm0_xexec(s32) = SI_IF %13(s1), %bb.6, implicit-def $exec, implicit-def $scc, 
implicit $exec G_BR %bb.5 @@ -520,6 +509,7 @@ body: | %30:_(s32) = G_CONSTANT i32 1 %31:_(s32) = G_ADD %18, %30 %32:_(s1) = G_ICMP intpred(slt), %18(s32), %0 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %28(s32) G_BR %bb.7 bb.5: @@ -527,9 +517,9 @@ body: | %33:_(s32) = G_CONSTANT i32 5 G_STORE %33(s32), %6(p0) :: (store (s32)) + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %15(s32) bb.6: - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %15(s32) SI_RETURN bb.7: @@ -538,7 +528,6 @@ body: | %19:_(s32) = G_PHI %31(s32), %bb.4, %7(s32), %bb.3 %34:_(s1) = G_PHI %29(s1), %bb.4, %20(s1), %bb.3 %35:_(s1) = G_PHI %32(s1), %bb.4, %20(s1), %bb.3 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %28(s32) %36:_(s1) = G_CONSTANT i1 true %37:_(s1) = G_XOR %34, %36 %17:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %35(s1), %16(s32) @@ -549,8 +538,7 @@ body: | successors: %bb.2(0x80000000) %14:_(s1) = G_PHI %37(s1), %bb.7 - %38:_(s32) = G_PHI %17(s32), %bb.7 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %38(s32) + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %11(s32) G_BR %bb.2 ... 
@@ -579,7 +567,7 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x80000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[DEF1]](s1), %bb.0, %39(s1), %bb.6 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[DEF1]](s1), %bb.0, %38(s1), %bb.6 ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %11(s32), %bb.6, [[C]](s32), %bb.0 ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %13(s32), %bb.6 ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1) @@ -600,12 +588,12 @@ body: | ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C1]](s32) ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[SHL]](s64) ; GFX10-NEXT: G_STORE [[PHI2]](s32), [[PTR_ADD]](p1) :: (store (s32), addrspace 1) + ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), [[SI_IF]](s32) ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.4: ; GFX10-NEXT: successors: %bb.5(0x40000000), %bb.6(0x40000000) ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY1]](s32), [[PHI2]] ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[C2]](s1) ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[COPY8]](s1) @@ -619,6 +607,7 @@ body: | ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1) ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C4]] + ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), [[SI_IF1]](s32) ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY9]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY10]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), 
[[S_AND_B32_]](s1), implicit-def $scc @@ -629,7 +618,6 @@ body: | ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[COPY8]](s1), %bb.4, [[S_OR_B32_]](s1), %bb.5 ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.5, [[DEF]](s32), %bb.4 ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32) ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY11]](s1), [[PHI1]](s32) ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY7]](s1), implicit-def $scc @@ -640,20 +628,18 @@ body: | ; GFX10-NEXT: bb.7: ; GFX10-NEXT: successors: %bb.8(0x40000000), %bb.9(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.6 - ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[PHI2]](s32), %bb.6 + ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[PHI2]](s32), %bb.6 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_1]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32) ; GFX10-NEXT: [[SI_IF2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY12]](s1), %bb.9, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.8 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.8: ; GFX10-NEXT: successors: %bb.9(0x80000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: G_STORE [[PHI6]](s32), [[MV1]](p1) :: (store (s32), addrspace 1) + ; GFX10-NEXT: G_STORE [[PHI5]](s32), [[MV1]](p1) :: (store (s32), addrspace 1) + ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), [[SI_IF2]](s32) ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.9: - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF2]](s32) ; GFX10-NEXT: SI_RETURN bb.0: successors: 
%bb.1(0x80000000) @@ -691,12 +677,12 @@ body: | %18:_(s64) = G_SHL %16, %17(s32) %19:_(p1) = G_PTR_ADD %4, %18(s64) G_STORE %12(s32), %19(p1) :: (store (s32), addrspace 1) + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %15(s32) bb.4: successors: %bb.5(0x40000000), %bb.6(0x40000000) %20:_(s1) = G_CONSTANT i1 true - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %15(s32) %21:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), %1(s32), %12 %22:sreg_32_xm0_xexec(s32) = SI_IF %21(s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec G_BR %bb.5 @@ -707,13 +693,13 @@ body: | %23:_(s1) = G_CONSTANT i1 false %24:_(s32) = G_CONSTANT i32 1 %25:_(s32) = G_ADD %12, %24 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %22(s32) bb.6: successors: %bb.7(0x04000000), %bb.1(0x7c000000) %13:_(s32) = G_PHI %25(s32), %bb.5, %9(s32), %bb.4 %26:_(s1) = G_PHI %23(s1), %bb.5, %20(s1), %bb.4 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %22(s32) %11:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %26(s1), %10(s32) SI_LOOP %11(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec G_BR %bb.7 @@ -721,10 +707,8 @@ body: | bb.7: successors: %bb.8(0x40000000), %bb.9(0x40000000) - %27:_(s32) = G_PHI %11(s32), %bb.6 %28:sreg_32_xm0_xexec(s1) = G_PHI %14(s1), %bb.6 %29:_(s32) = G_PHI %12(s32), %bb.6 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %27(s32) %30:sreg_32_xm0_xexec(s32) = SI_IF %28(s1), %bb.9, implicit-def $exec, implicit-def $scc, implicit $exec G_BR %bb.8 @@ -732,9 +716,9 @@ body: | successors: %bb.9(0x80000000) G_STORE %29(s32), %7(p1) :: (store (s32), addrspace 1) + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %30(s32) bb.9: - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %30(s32) SI_RETURN ... 
@@ -764,9 +748,9 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %54(s1), %bb.3 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %43(s1), %bb.3 - ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[COPY5]](s1), %bb.0, %33(s1), %bb.3 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %53(s1), %bb.3 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %42(s1), %bb.3 + ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[COPY5]](s1), %bb.0, %32(s1), %bb.3 ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI %10(s32), %bb.3, [[C]](s32), %bb.0 ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %12(s32), %bb.3 ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) @@ -791,6 +775,7 @@ body: | ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[LOAD]](s32), [[C3]] ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1) + ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), [[SI_IF]](s32) ; GFX10-NEXT: [[DEF2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = IMPLICIT_DEF ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc @@ -803,7 +788,6 @@ body: | ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[PHI2]](s1), %bb.1, [[DEF2]](s1), %bb.2 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1) ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI6]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) ; GFX10-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE [[COPY12]] ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[FREEZE]](s1) ; 
GFX10-NEXT: [[COPY15:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[FREEZE]](s1) @@ -821,9 +805,7 @@ body: | ; GFX10-NEXT: G_BR %bb.4 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.4: - ; GFX10-NEXT: [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.3 ; GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_3]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI7]](s32) ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 ; GFX10-NEXT: [[C6:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY16]](s1), [[C6]], [[C5]] @@ -862,12 +844,12 @@ body: | %20:_(s32) = G_LOAD %19(p1) :: (load (s32), addrspace 1) %21:_(s32) = G_CONSTANT i32 0 %22:_(s1) = G_ICMP intpred(eq), %20(s32), %21 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %15(s32) bb.3: successors: %bb.4(0x04000000), %bb.1(0x7c000000) %23:_(s1) = G_PHI %22(s1), %bb.2, %13(s1), %bb.1 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %15(s32) %14:_(s1) = G_FREEZE %23 %24:_(s32) = G_CONSTANT i32 1 %12:_(s32) = G_ADD %11, %24 @@ -878,8 +860,6 @@ body: | bb.4: %26:_(s1) = G_PHI %14(s1), %bb.3 - %27:_(s32) = G_PHI %10(s32), %bb.3 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %27(s32) %28:_(s32) = G_FCONSTANT float 0.000000e+00 %29:_(s32) = G_FCONSTANT float 1.000000e+00 %30:_(s32) = G_SELECT %26(s1), %29, %28 @@ -915,9 +895,9 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[DEF3]](s1), %bb.0, %67(s1), %bb.5 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF2]](s1), %bb.0, %56(s1), %bb.5 - ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %43(s1), %bb.5 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[DEF3]](s1), %bb.0, %66(s1), %bb.5 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF2]](s1), %bb.0, %55(s1), 
%bb.5 + ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %42(s1), %bb.5 ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.5, [[C]](s32), %bb.0 ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.5 ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1) @@ -949,6 +929,7 @@ body: | ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 ; GFX10-NEXT: G_STORE [[C4]](s32), [[MV2]](p1) :: (store (s32), addrspace 1) + ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %25(s32) ; GFX10-NEXT: G_BR %bb.4 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.3: @@ -967,6 +948,7 @@ body: | ; GFX10-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 100 ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI4]](s32), [[C8]] ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP1]](s1) + ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), [[SI_IF]](s32) ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY11]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY13]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc @@ -976,7 +958,6 @@ body: | ; GFX10-NEXT: G_BR %bb.5 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.4: - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %35(s32) ; GFX10-NEXT: S_ENDPGM 0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.5: @@ -988,7 +969,6 @@ body: | ; GFX10-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1) ; GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]](s1) ; GFX10-NEXT: [[COPY17:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY16]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC 
intrinsic(@llvm.amdgcn.if.break), [[COPY15]](s1), [[PHI3]](s32) ; GFX10-NEXT: [[S_ANDN2_B32_4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY17]](s1), implicit-def $scc @@ -999,9 +979,7 @@ body: | ; GFX10-NEXT: bb.6: ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.5 ; GFX10-NEXT: [[COPY18:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_4]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI8]](s32) ; GFX10-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY18]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.2 bb.0: @@ -1041,6 +1019,7 @@ body: | %24:_(s32) = G_CONSTANT i32 10 G_STORE %24(s32), %8(p1) :: (store (s32), addrspace 1) + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %35(s32) G_BR %bb.4 bb.3: @@ -1057,10 +1036,10 @@ body: | %32:_(s32) = G_ADD %13, %30 %33:_(s32) = G_CONSTANT i32 100 %34:_(s1) = G_ICMP intpred(ult), %13(s32), %33 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %23(s32) G_BR %bb.5 bb.4: - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %35(s32) S_ENDPGM 0 bb.5: @@ -1069,7 +1048,6 @@ body: | %14:_(s32) = G_PHI %32(s32), %bb.3, %10(s32), %bb.1 %36:_(s1) = G_PHI %25(s1), %bb.3, %15(s1), %bb.1 %37:_(s1) = G_PHI %34(s1), %bb.3, %15(s1), %bb.1 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %23(s32) %12:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %37(s1), %11(s32) SI_LOOP %12(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec G_BR %bb.6 @@ -1078,8 +1056,6 @@ body: | successors: %bb.2(0x40000000), %bb.4(0x40000000) %38:sreg_32_xm0_xexec(s1) = G_PHI %36(s1), %bb.5 - %39:_(s32) = G_PHI %12(s32), %bb.5 - 
G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %39(s32) %35:sreg_32_xm0_xexec(s32) = SI_IF %38(s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec G_BR %bb.2 ... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll index 1698f84eea5185..daa0107f7e82b5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll @@ -7,17 +7,20 @@ define amdgpu_ps void @divergent_i1_phi_if_then(ptr addrspace(1) %out, i32 %tid, i32 %cond) { ; GFX10-LABEL: divergent_i1_phi_if_then: ; GFX10: ; %bb.0: ; %A -; GFX10-NEXT: v_cmp_le_u32_e64 s0, 6, v2 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e64 s1, 6, v2 +; GFX10-NEXT: s_mov_b32 s0, exec_lo +; GFX10-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX10-NEXT: ; %bb.1: ; %B ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 1, v2 -; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo +; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo ; GFX10-NEXT: s_and_b32 s2, exec_lo, vcc_lo -; GFX10-NEXT: s_or_b32 s0, s0, s2 -; GFX10-NEXT: ; %bb.2: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, s0 +; GFX10-NEXT: s_or_b32 s1, s1, s2 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-NEXT: .LBB0_2: ; %exit +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, s1 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 2, v2 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm @@ -41,26 +44,32 @@ exit: define amdgpu_ps void @divergent_i1_phi_if_else(ptr addrspace(1) %out, i32 %tid, i32 %cond) { ; GFX10-LABEL: divergent_i1_phi_if_else: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_and_b32 s0, 1, s0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 +; GFX10-NEXT: s_and_b32 s0, 1, s0 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 -; 
GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX10-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX10-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX10-NEXT: ; %bb.1: ; %B ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 2, v2 ; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo ; GFX10-NEXT: ; implicit-def: $vgpr2 ; GFX10-NEXT: s_and_b32 s2, exec_lo, vcc_lo ; GFX10-NEXT: s_or_b32 s0, s0, s2 -; GFX10-NEXT: ; %bb.2: ; %Flow -; GFX10-NEXT: s_andn2_saveexec_b32 s1, s1 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-NEXT: .LBB1_2: ; %Flow +; GFX10-NEXT: s_xor_b32 s2, s1, exec_lo +; GFX10-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10-NEXT: s_cmov_b32 exec_lo, s1 +; GFX10-NEXT: s_cbranch_scc0 .LBB1_4 ; GFX10-NEXT: ; %bb.3: ; %A ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 1, v2 ; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo -; GFX10-NEXT: s_and_b32 s2, exec_lo, vcc_lo -; GFX10-NEXT: s_or_b32 s0, s0, s2 -; GFX10-NEXT: ; %bb.4: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-NEXT: s_and_b32 s1, exec_lo, vcc_lo +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: .LBB1_4: ; %exit ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, s0 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 2, v2 ; GFX10-NEXT: global_store_dword v[0:1], v2, off @@ -111,26 +120,28 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a ; GFX10-NEXT: s_branch .LBB2_2 ; GFX10-NEXT: .LBB2_1: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB2_2 Depth=1 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: s_and_b32 s2, exec_lo, s1 ; GFX10-NEXT: s_or_b32 s0, s2, s0 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: s_cbranch_execz .LBB2_4 +; GFX10-NEXT: s_andn2_b32 s2, exec_lo, s0 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_cselect_b32 exec_lo, s2, s0 +; GFX10-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX10-NEXT: .LBB2_2: ; %A ; 
GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo -; GFX10-NEXT: s_and_b32 s2, exec_lo, -1 -; GFX10-NEXT: s_or_b32 s1, s1, s2 +; GFX10-NEXT: s_and_b32 s3, exec_lo, -1 +; GFX10-NEXT: s_mov_b32 s2, exec_lo +; GFX10-NEXT: s_or_b32 s1, s1, s3 ; GFX10-NEXT: v_lshlrev_b64 v[5:6], 2, v[4:5] ; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v2, v5 ; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v3, v6, vcc_lo ; GFX10-NEXT: global_load_dword v7, v[7:8], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v7 -; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB2_1 +; GFX10-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB2_1 ; GFX10-NEXT: ; %bb.3: ; %loop.body ; GFX10-NEXT: ; in Loop: Header=BB2_2 Depth=1 ; GFX10-NEXT: v_add_co_u32 v5, vcc_lo, v0, v5 @@ -145,6 +156,8 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v7, 1, v7 ; GFX10-NEXT: global_store_dword v[5:6], v7, off +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: s_branch .LBB2_1 ; GFX10-NEXT: .LBB2_4: ; %exit ; GFX10-NEXT: s_endpgm @@ -180,42 +193,46 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; GFX10-NEXT: s_branch .LBB3_3 ; GFX10-NEXT: .LBB3_1: ; %Flow3 ; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo ; GFX10-NEXT: s_and_b32 s3, exec_lo, s4 ; GFX10-NEXT: s_or_b32 s1, s1, s3 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: .LBB3_2: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: s_and_b32 s2, exec_lo, s1 ; GFX10-NEXT: 
s_or_b32 s0, s2, s0 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: s_cbranch_execz .LBB3_6 +; GFX10-NEXT: s_andn2_b32 s2, exec_lo, s0 +; GFX10-NEXT: s_cselect_b32 exec_lo, s2, s0 +; GFX10-NEXT: s_cbranch_scc0 .LBB3_6 ; GFX10-NEXT: .LBB3_3: ; %A ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo -; GFX10-NEXT: s_and_b32 s2, exec_lo, -1 -; GFX10-NEXT: s_or_b32 s1, s1, s2 +; GFX10-NEXT: s_and_b32 s3, exec_lo, -1 +; GFX10-NEXT: s_mov_b32 s2, exec_lo +; GFX10-NEXT: s_or_b32 s1, s1, s3 ; GFX10-NEXT: v_lshlrev_b64 v[7:8], 2, v[6:7] ; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v2, v7 ; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo ; GFX10-NEXT: global_load_dword v9, v[9:10], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9 -; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB3_2 +; GFX10-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX10-NEXT: ; %bb.4: ; %B ; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1 ; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v4, v7 ; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v5, v8, vcc_lo +; GFX10-NEXT: s_mov_b32 s3, exec_lo ; GFX10-NEXT: s_mov_b32 s4, -1 ; GFX10-NEXT: global_load_dword v9, v[9:10], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9 -; GFX10-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB3_1 +; GFX10-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB3_1 ; GFX10-NEXT: ; %bb.5: ; %loop.body ; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1 ; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v0, v7 @@ -230,6 +247,8 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v9 ; GFX10-NEXT: global_store_dword v[7:8], v9, off 
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10-NEXT: s_branch .LBB3_1 ; GFX10-NEXT: .LBB3_6: ; %exit ; GFX10-NEXT: s_endpgm @@ -271,58 +290,64 @@ define amdgpu_cs void @loop_with_3breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; GFX10-NEXT: s_branch .LBB4_4 ; GFX10-NEXT: .LBB4_1: ; %Flow5 ; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_andn2_b32 s4, -1, exec_lo ; GFX10-NEXT: s_and_b32 s5, exec_lo, s5 ; GFX10-NEXT: s_or_b32 s4, s4, s5 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10-NEXT: .LBB4_2: ; %Flow4 ; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo ; GFX10-NEXT: s_and_b32 s3, exec_lo, s4 ; GFX10-NEXT: s_or_b32 s1, s1, s3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: .LBB4_3: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: s_and_b32 s2, exec_lo, s1 ; GFX10-NEXT: s_or_b32 s0, s2, s0 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: s_cbranch_execz .LBB4_8 +; GFX10-NEXT: s_andn2_b32 s2, exec_lo, s0 +; GFX10-NEXT: s_cselect_b32 exec_lo, s2, s0 +; GFX10-NEXT: s_cbranch_scc0 .LBB4_8 ; GFX10-NEXT: .LBB4_4: ; %A ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo -; GFX10-NEXT: s_and_b32 s2, exec_lo, -1 -; GFX10-NEXT: s_or_b32 s1, s1, s2 +; GFX10-NEXT: s_and_b32 s3, exec_lo, -1 +; GFX10-NEXT: s_mov_b32 s2, exec_lo +; GFX10-NEXT: s_or_b32 s1, s1, s3 ; GFX10-NEXT: v_lshlrev_b64 v[9:10], 2, v[8:9] ; GFX10-NEXT: v_add_co_u32 v11, vcc_lo, v2, v9 ; GFX10-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v3, v10, vcc_lo ; GFX10-NEXT: global_load_dword v11, v[11:12], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 -; 
GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB4_3 +; GFX10-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX10-NEXT: ; %bb.5: ; %B ; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1 ; GFX10-NEXT: v_add_co_u32 v11, vcc_lo, v4, v9 ; GFX10-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v5, v10, vcc_lo +; GFX10-NEXT: s_mov_b32 s3, exec_lo ; GFX10-NEXT: s_mov_b32 s4, -1 ; GFX10-NEXT: global_load_dword v11, v[11:12], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 -; GFX10-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB4_2 +; GFX10-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX10-NEXT: ; %bb.6: ; %C ; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1 ; GFX10-NEXT: v_add_co_u32 v11, vcc_lo, v6, v9 ; GFX10-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v7, v10, vcc_lo +; GFX10-NEXT: s_mov_b32 s4, exec_lo ; GFX10-NEXT: s_mov_b32 s5, -1 ; GFX10-NEXT: global_load_dword v11, v[11:12], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 -; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB4_1 +; GFX10-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB4_1 ; GFX10-NEXT: ; %bb.7: ; %loop.body ; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1 ; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v0, v9 @@ -337,6 +362,8 @@ define amdgpu_cs void @loop_with_3breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v11 ; GFX10-NEXT: global_store_dword v[9:10], v11, off +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_branch .LBB4_1 ; GFX10-NEXT: .LBB4_8: ; %exit ; GFX10-NEXT: s_endpgm @@ -390,31 +417,33 @@ define amdgpu_cs void @loop_with_div_break_with_body(ptr addrspace(1) %x, ptr ad ; 
GFX10-NEXT: s_branch .LBB5_2 ; GFX10-NEXT: .LBB5_1: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_and_b32 s4, exec_lo, s2 ; GFX10-NEXT: s_or_b32 s0, s4, s0 ; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo ; GFX10-NEXT: s_and_b32 s4, exec_lo, s3 ; GFX10-NEXT: s_or_b32 s1, s1, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: s_cbranch_execz .LBB5_4 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s0 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s0 +; GFX10-NEXT: s_cbranch_scc0 .LBB5_4 ; GFX10-NEXT: .LBB5_2: ; %A ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GFX10-NEXT: s_andn2_b32 s3, s3, exec_lo -; GFX10-NEXT: s_and_b32 s4, exec_lo, -1 +; GFX10-NEXT: s_and_b32 s5, exec_lo, -1 ; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo -; GFX10-NEXT: s_or_b32 s3, s3, s4 +; GFX10-NEXT: s_or_b32 s3, s3, s5 ; GFX10-NEXT: v_lshlrev_b64 v[7:8], 2, v[6:7] -; GFX10-NEXT: s_or_b32 s2, s2, s4 +; GFX10-NEXT: s_or_b32 s2, s2, s5 +; GFX10-NEXT: s_mov_b32 s4, exec_lo ; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v2, v7 ; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo ; GFX10-NEXT: global_load_dword v9, v[9:10], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9 -; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB5_1 +; GFX10-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB5_1 ; GFX10-NEXT: ; %bb.3: ; %loop.body ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1 ; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v0, v7 @@ -432,12 +461,15 @@ define amdgpu_cs void @loop_with_div_break_with_body(ptr addrspace(1) %x, ptr ad ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v9 ; GFX10-NEXT: global_store_dword v[7:8], v9, off +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 
exec_lo, exec_lo, s4 ; GFX10-NEXT: s_branch .LBB5_1 ; GFX10-NEXT: .LBB5_4: ; %loop.exit.guard -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: s_and_saveexec_b32 s0, s1 -; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX10-NEXT: s_cbranch_execz .LBB5_6 +; GFX10-NEXT: s_and_b32 s0, s1, exec_lo +; GFX10-NEXT: s_xor_b32 s1, s0, exec_lo +; GFX10-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10-NEXT: s_cmov_b32 exec_lo, s0 +; GFX10-NEXT: s_cbranch_scc0 .LBB5_6 ; GFX10-NEXT: ; %bb.5: ; %break.body ; GFX10-NEXT: v_mov_b32_e32 v0, 10 ; GFX10-NEXT: global_store_dword v[4:5], v0, off diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir index 1d291eeab8e9d7..4bae48683e1b7f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir @@ -38,7 +38,7 @@ body: | ; GFX10-NEXT: bb.2: ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[COPY4]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.1 ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) + ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), [[SI_IF]](s32) ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY7]](s1), [[C4]], [[C3]] @@ -68,7 +68,7 @@ body: | bb.2: %12:_(s1) = G_PHI %6(s1), %bb.0, %11(s1), %bb.1 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %9(s32) + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %9(s32) %13:_(s32) = G_CONSTANT i32 2 %14:_(s32) = G_CONSTANT i32 1 %15:_(s32) = G_SELECT %12(s1), %14, %13 @@ -115,6 +115,7 @@ body: | ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY2]](s32), [[C1]] ; GFX10-NEXT: 
[[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP1]](s1) + ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), [[SI_ELSE]](s32) ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc @@ -134,7 +135,6 @@ body: | ; GFX10-NEXT: bb.4: ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[COPY7]](s1), %bb.1, [[S_OR_B32_]](s1), %bb.2 ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_ELSE]](s32) ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY11]](s1), [[C3]], [[C4]] @@ -167,6 +167,7 @@ body: | %12:_(s32) = G_CONSTANT i32 1 %13:_(s1) = G_ICMP intpred(uge), %3(s32), %12 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %11(s32) G_BR %bb.4 bb.3: @@ -178,7 +179,6 @@ body: | bb.4: %15:_(s1) = G_PHI %9(s1), %bb.1, %13(s1), %bb.2 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %11(s32) %16:_(s32) = G_CONSTANT i32 1 %17:_(s32) = G_CONSTANT i32 2 %18:_(s32) = G_SELECT %15(s1), %16, %17 @@ -209,7 +209,7 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %35(s1), %bb.3 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %34(s1), %bb.3 ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.3, [[C]](s32), %bb.0 ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %11(s32), %bb.3 ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) @@ -243,6 +243,7 @@ body: | ; GFX10-NEXT: 
[[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 100 ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI2]](s32), [[C6]] ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP1]](s1) + ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), [[SI_IF]](s32) ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY7]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc @@ -253,14 +254,11 @@ body: | ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_1]](s1), %bb.2 ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.2, [[DEF]](s32), %bb.1 ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY8]](s1), [[PHI1]](s32) ; GFX10-NEXT: SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.4 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.4: - ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.3 - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32) ; GFX10-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1(0x80000000) @@ -304,20 +302,18 @@ body: | %27:_(s32) = G_ADD %10, %25 %28:_(s32) = G_CONSTANT i32 100 %29:_(s1) = G_ICMP intpred(ult), %10(s32), %28 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %20(s32) bb.3: successors: %bb.4(0x04000000), %bb.1(0x7c000000) %11:_(s32) = G_PHI %27(s32), %bb.2, %7(s32), %bb.1 %30:_(s1) = G_PHI %29(s1), %bb.2, %12(s1), %bb.1 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %20(s32) %9:sreg_32_xm0_xexec(s32) = G_INTRINSIC 
intrinsic(@llvm.amdgcn.if.break), %30(s1), %8(s32) SI_LOOP %9(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec G_BR %bb.4 bb.4: - %31:_(s32) = G_PHI %9(s32), %bb.3 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %31(s32) S_ENDPGM 0 ... @@ -347,7 +343,7 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %48(s1), %bb.3 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %47(s1), %bb.3 ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.3, [[C]](s32), %bb.0 ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.3 ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) @@ -385,10 +381,9 @@ body: | ; GFX10-NEXT: bb.3: ; GFX10-NEXT: successors: %bb.6(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, %47(s1), %bb.5 + ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, %46(s1), %bb.5 ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI %32(s32), %bb.5, [[DEF]](s32), %bb.1 ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY11]](s1), [[PHI1]](s32) ; GFX10-NEXT: SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.6 @@ -407,6 +402,7 @@ body: | ; GFX10-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 100 ; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI2]](s32), [[C9]] ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1) + ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), [[SI_IF1]](s32) ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = 
S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY12]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc @@ -418,15 +414,13 @@ body: | ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.4, [[DEF]](s32), %bb.2 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1) ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[COPY13]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32) + ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), [[SI_IF]](s32) ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY14]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc ; GFX10-NEXT: G_BR %bb.3 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.6: - ; GFX10-NEXT: [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.3 - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI7]](s32) ; GFX10-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1(0x80000000) @@ -478,7 +472,6 @@ body: | %14:_(s32) = G_PHI %32(s32), %bb.5, %10(s32), %bb.1 %33:_(s1) = G_PHI %34(s1), %bb.5, %15(s1), %bb.1 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %23(s32) %12:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %33(s1), %11(s32) SI_LOOP %12(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec G_BR %bb.6 @@ -496,18 +489,17 @@ body: | %41:_(s32) = G_ADD %13, %39 %42:_(s32) = G_CONSTANT i32 100 %43:_(s1) = G_ICMP intpred(ult), %13(s32), %42 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %31(s32) bb.5: successors: %bb.3(0x80000000) %32:_(s32) = G_PHI 
%41(s32), %bb.4, %10(s32), %bb.2 %34:_(s1) = G_PHI %43(s1), %bb.4, %24(s1), %bb.2 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %31(s32) + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %23(s32) G_BR %bb.3 bb.6: - %44:_(s32) = G_PHI %12(s32), %bb.3 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %44(s32) S_ENDPGM 0 ... @@ -540,7 +532,7 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %61(s1), %bb.3 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %60(s1), %bb.3 ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.3, [[C]](s32), %bb.0 ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %17(s32), %bb.3 ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) @@ -578,10 +570,9 @@ body: | ; GFX10-NEXT: bb.3: ; GFX10-NEXT: successors: %bb.8(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, %60(s1), %bb.5 + ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, %59(s1), %bb.5 ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI %35(s32), %bb.5, [[DEF]](s32), %bb.1 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY13]](s1), [[PHI1]](s32) ; GFX10-NEXT: SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.8 @@ -604,11 +595,11 @@ body: | ; GFX10-NEXT: bb.5: ; GFX10-NEXT: successors: %bb.3(0x80000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[COPY11]](s1), %bb.2, %72(s1), %bb.7 + ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[COPY11]](s1), %bb.2, %71(s1), %bb.7 
; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI %46(s32), %bb.7, [[DEF]](s32), %bb.2 ; GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1) ; GFX10-NEXT: [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[COPY16]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32) + ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), [[SI_IF]](s32) ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY17]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc @@ -628,6 +619,7 @@ body: | ; GFX10-NEXT: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 100 ; GFX10-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI2]](s32), [[C12]] ; GFX10-NEXT: [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP3]](s1) + ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), [[SI_IF2]](s32) ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY15]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY18]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc @@ -639,15 +631,13 @@ body: | ; GFX10-NEXT: [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.6, [[DEF]](s32), %bb.4 ; GFX10-NEXT: [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]](s1) ; GFX10-NEXT: [[COPY20:%[0-9]+]]:sreg_32(s1) = COPY [[COPY19]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF2]](s32) + ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), [[SI_IF1]](s32) ; GFX10-NEXT: [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY12]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: 
[[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY20]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_3:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc ; GFX10-NEXT: G_BR %bb.5 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.8: - ; GFX10-NEXT: [[PHI9:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.3 - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI9]](s32) ; GFX10-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1(0x80000000) @@ -702,7 +692,6 @@ body: | %17:_(s32) = G_PHI %35(s32), %bb.5, %13(s32), %bb.1 %36:_(s1) = G_PHI %37(s1), %bb.5, %18(s1), %bb.1 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %26(s32) %15:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %36(s1), %14(s32) SI_LOOP %15(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec G_BR %bb.8 @@ -725,7 +714,7 @@ body: | %35:_(s32) = G_PHI %46(s32), %bb.7, %13(s32), %bb.2 %37:_(s1) = G_PHI %47(s1), %bb.7, %27(s1), %bb.2 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %34(s32) + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %26(s32) G_BR %bb.3 bb.6: @@ -741,18 +730,17 @@ body: | %54:_(s32) = G_ADD %16, %52 %55:_(s32) = G_CONSTANT i32 100 %56:_(s1) = G_ICMP intpred(ult), %16(s32), %55 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %45(s32) bb.7: successors: %bb.5(0x80000000) %46:_(s32) = G_PHI %54(s32), %bb.6, %13(s32), %bb.4 %47:_(s1) = G_PHI %56(s1), %bb.6, %38(s1), %bb.4 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %45(s32) + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %34(s32) G_BR %bb.5 bb.8: - %57:_(s32) = G_PHI %15(s32), %bb.3 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %57(s32) S_ENDPGM 0 ... 
@@ -784,9 +772,9 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[DEF3]](s1), %bb.0, %67(s1), %bb.5 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF2]](s1), %bb.0, %56(s1), %bb.5 - ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %43(s1), %bb.5 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[DEF3]](s1), %bb.0, %66(s1), %bb.5 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF2]](s1), %bb.0, %55(s1), %bb.5 + ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %42(s1), %bb.5 ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.5, [[C]](s32), %bb.0 ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.5 ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1) @@ -818,6 +806,7 @@ body: | ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 ; GFX10-NEXT: G_STORE [[C4]](s32), [[MV2]](p1) :: (store (s32), addrspace 1) + ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %25(s32) ; GFX10-NEXT: G_BR %bb.4 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.3: @@ -836,6 +825,7 @@ body: | ; GFX10-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 100 ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI4]](s32), [[C8]] ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP1]](s1) + ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), [[SI_IF]](s32) ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY11]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY13]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc @@ -845,7 +835,6 @@ body: | ; GFX10-NEXT: G_BR %bb.5 ; GFX10-NEXT: {{ $}} ; 
GFX10-NEXT: bb.4: - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %35(s32) ; GFX10-NEXT: S_ENDPGM 0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.5: @@ -857,7 +846,6 @@ body: | ; GFX10-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1) ; GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]](s1) ; GFX10-NEXT: [[COPY17:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY16]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY15]](s1), [[PHI3]](s32) ; GFX10-NEXT: [[S_ANDN2_B32_4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY17]](s1), implicit-def $scc @@ -868,9 +856,7 @@ body: | ; GFX10-NEXT: bb.6: ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.5 ; GFX10-NEXT: [[COPY18:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_4]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI8]](s32) ; GFX10-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY18]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.2 bb.0: @@ -910,6 +896,7 @@ body: | %24:_(s32) = G_CONSTANT i32 10 G_STORE %24(s32), %8(p1) :: (store (s32), addrspace 1) + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %35(s32) G_BR %bb.4 bb.3: @@ -926,10 +913,10 @@ body: | %32:_(s32) = G_ADD %13, %30 %33:_(s32) = G_CONSTANT i32 100 %34:_(s1) = G_ICMP intpred(ult), %13(s32), %33 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %23(s32) G_BR %bb.5 bb.4: - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %35(s32) S_ENDPGM 0 bb.5: @@ -938,7 +925,6 @@ body: | %14:_(s32) = G_PHI %32(s32), %bb.3, 
%10(s32), %bb.1 %36:_(s1) = G_PHI %25(s1), %bb.3, %15(s1), %bb.1 %37:_(s1) = G_PHI %34(s1), %bb.3, %15(s1), %bb.1 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %23(s32) %12:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %37(s1), %11(s32) SI_LOOP %12(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec G_BR %bb.6 @@ -947,8 +933,6 @@ body: | successors: %bb.2(0x40000000), %bb.4(0x40000000) %38:sreg_32_xm0_xexec(s1) = G_PHI %36(s1), %bb.5 - %39:_(s32) = G_PHI %12(s32), %bb.5 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %39(s32) %35:sreg_32_xm0_xexec(s32) = SI_IF %38(s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec G_BR %bb.2 ... @@ -989,15 +973,14 @@ body: | ; GFX10-NEXT: bb.2: ; GFX10-NEXT: successors: %bb.4(0x40000000), %bb.7(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI %67(s1), %bb.6, %71(s1), %bb.7 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI %49(s1), %bb.6, %48(s1), %bb.7 - ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI %35(s1), %bb.6, %34(s1), %bb.7 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI %65(s1), %bb.6, %69(s1), %bb.7 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI %47(s1), %bb.6, %46(s1), %bb.7 + ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI %33(s1), %bb.6, %32(s1), %bb.7 ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1) ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1) ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1) ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY10]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %15(s32) - ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY9]](s1), %17(s32) + ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY9]](s1), %16(s32) ; GFX10-NEXT: 
[[S_ANDN2_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc @@ -1008,7 +991,7 @@ body: | ; GFX10-NEXT: bb.3: ; GFX10-NEXT: successors: %bb.6(0x04000000), %bb.3(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.1, %19(s32), %bb.3 + ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.1, %18(s32), %bb.3 ; GFX10-NEXT: [[INT1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[ICMP1]](s1), [[PHI3]](s32) ; GFX10-NEXT: SI_LOOP [[INT1]](s32), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.6 @@ -1016,17 +999,16 @@ body: | ; GFX10-NEXT: bb.4: ; GFX10-NEXT: successors: %bb.5(0x04000000), %bb.7(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INT]](s32) ; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY5]](s32), [[COPY]] ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1) ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[C2]](s1) ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP]], [[C2]] ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s1) = G_OR [[ICMP2]], [[XOR]] - ; GFX10-NEXT: [[INT2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[OR]](s1), %25(s32) + ; GFX10-NEXT: [[INT2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[OR]](s1), %24(s32) ; GFX10-NEXT: [[DEF4:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF ; GFX10-NEXT: [[DEF5:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF - ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 %63(s1), $exec_lo, implicit-def $scc 
+ ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 %61(s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY13]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY12]](s1), $exec_lo, implicit-def $scc @@ -1036,9 +1018,7 @@ body: | ; GFX10-NEXT: G_BR %bb.5 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.5: - ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INT2]](s32), %bb.4 ; GFX10-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_1]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32) ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY15]](s1), [[COPY3]], [[COPY2]] ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[SELECT]](s32) ; GFX10-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32) @@ -1047,15 +1027,14 @@ body: | ; GFX10-NEXT: bb.6: ; GFX10-NEXT: successors: %bb.2(0x80000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INT1]](s32), %bb.3 ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s1) = G_CONSTANT i1 false ; GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1) ; GFX10-NEXT: [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32) - ; GFX10-NEXT: [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 %42(s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %28(s32) + ; GFX10-NEXT: [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 %40(s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY17]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_3:%[0-9]+]]:sreg_32(s1) = S_OR_B32 
[[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc - ; GFX10-NEXT: [[S_ANDN2_B32_4:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 %56(s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_ANDN2_B32_4:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 %54(s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_4:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY16]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_4:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_4]](s1), [[S_AND_B32_4]](s1), implicit-def $scc ; GFX10-NEXT: [[DEF6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = IMPLICIT_DEF @@ -1064,16 +1043,16 @@ body: | ; GFX10-NEXT: bb.7: ; GFX10-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[COPY7]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.2, [[S_OR_B32_2]](s1), %bb.4 - ; GFX10-NEXT: [[PHI7:%[0-9]+]]:sreg_32(s1) = PHI [[DEF3]](s1), %bb.0, [[PHI7]](s1), %bb.2, [[S_OR_B32_1]](s1), %bb.4 - ; GFX10-NEXT: [[PHI8:%[0-9]+]]:sreg_32(s1) = PHI [[DEF2]](s1), %bb.0, [[PHI1]](s1), %bb.2, [[DEF5]](s1), %bb.4 - ; GFX10-NEXT: [[PHI9:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, [[PHI2]](s1), %bb.2, [[DEF4]](s1), %bb.4 - ; GFX10-NEXT: [[PHI10:%[0-9]+]]:_(s32) = G_PHI [[INT2]](s32), %bb.4, [[PHI10]](s32), %bb.2, [[C]](s32), %bb.0 - ; GFX10-NEXT: [[PHI11:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.4, [[INT]](s32), %bb.2, [[C]](s32), %bb.0 - ; GFX10-NEXT: [[COPY18:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI6]](s1) - ; GFX10-NEXT: [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]](s1) - ; GFX10-NEXT: [[COPY20:%[0-9]+]]:sreg_32(s1) = COPY [[PHI8]](s1) - ; GFX10-NEXT: [[COPY21:%[0-9]+]]:sreg_32(s1) = COPY [[PHI9]](s1) + ; GFX10-NEXT: [[PHI4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[COPY7]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.2, [[S_OR_B32_2]](s1), %bb.4 + ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[DEF3]](s1), %bb.0, [[PHI5]](s1), %bb.2, [[S_OR_B32_1]](s1), %bb.4 + ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32(s1) = PHI [[DEF2]](s1), 
%bb.0, [[PHI1]](s1), %bb.2, [[DEF5]](s1), %bb.4 + ; GFX10-NEXT: [[PHI7:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, [[PHI2]](s1), %bb.2, [[DEF4]](s1), %bb.4 + ; GFX10-NEXT: [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[INT2]](s32), %bb.4, [[PHI8]](s32), %bb.2, [[C]](s32), %bb.0 + ; GFX10-NEXT: [[PHI9:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.4, [[INT]](s32), %bb.2, [[C]](s32), %bb.0 + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI4]](s1) + ; GFX10-NEXT: [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1) + ; GFX10-NEXT: [[COPY20:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]](s1) + ; GFX10-NEXT: [[COPY21:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]](s1) ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 true ; GFX10-NEXT: [[COPY22:%[0-9]+]]:sreg_32(s1) = COPY [[C4]](s1) ; GFX10-NEXT: [[S_ANDN2_B32_5:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY21]](s1), $exec_lo, implicit-def $scc @@ -1113,7 +1092,6 @@ body: | %11:_(s1) = G_PHI %12(s1), %bb.6, %7(s1), %bb.7 %13:_(s1) = G_PHI %12(s1), %bb.6, %14(s1), %bb.7 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %15(s32) %16:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %13(s1), %17(s32) SI_LOOP %16(s32), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec G_BR %bb.4 @@ -1129,7 +1107,6 @@ body: | bb.4: successors: %bb.5(0x04000000), %bb.7(0x7c000000) - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %16(s32) %20:_(s1) = G_ICMP intpred(sgt), %5(s32), %0 %21:_(s1) = G_CONSTANT i1 true %22:_(s1) = G_XOR %8, %21 @@ -1140,8 +1117,6 @@ body: | bb.5: %26:_(s1) = G_PHI %20(s1), %bb.4 - %27:_(s32) = G_PHI %24(s32), %bb.4 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %27(s32) %28:_(s32) = G_SELECT %26(s1), %3, %2 %29:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), %28(s32) $sgpr0 = COPY %29(s32) @@ -1150,9 +1125,8 @@ body: | bb.6: successors: %bb.2(0x80000000) - %30:_(s32) = G_PHI %19(s32), %bb.3 %12:_(s1) = G_CONSTANT i1 false - 
G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %30(s32) + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %15(s32) G_BR %bb.2 bb.7: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll index 1855ede0483def..e5766857628d37 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll @@ -21,10 +21,10 @@ define void @temporal_divergent_i1_phi(float %val, ptr %addr) { ; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo ; GFX10-NEXT: s_and_b32 s4, exec_lo, s4 ; GFX10-NEXT: s_or_b32 s6, s6, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB0_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX10-NEXT: ; %bb.2: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6 ; GFX10-NEXT: flat_store_dword v[1:2], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -67,10 +67,10 @@ define void @temporal_divergent_i1_non_phi(float %val, ptr %addr) { ; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo ; GFX10-NEXT: s_and_b32 s4, exec_lo, s4 ; GFX10-NEXT: s_or_b32 s6, s6, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB1_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX10-NEXT: ; %bb.2: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6 ; GFX10-NEXT: flat_store_dword v[1:2], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -129,8 +129,9 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, i32 %x.size, ptr ad ; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo ; GFX10-NEXT: s_and_b32 s5, exec_lo, s5 ; GFX10-NEXT: s_or_b32 s0, s0, s5 -; GFX10-NEXT: s_andn2_b32 
exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execz .LBB2_5 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc0 .LBB2_5 ; GFX10-NEXT: .LBB2_3: ; %A ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 @@ -149,10 +150,11 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, i32 %x.size, ptr ad ; GFX10-NEXT: ; implicit-def: $vgpr5 ; GFX10-NEXT: s_branch .LBB2_2 ; GFX10-NEXT: .LBB2_5: ; %loop.exit.guard -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_and_saveexec_b32 s1, s0 -; GFX10-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX10-NEXT: s_cbranch_execz .LBB2_7 +; GFX10-NEXT: s_and_b32 s0, s0, exec_lo +; GFX10-NEXT: s_xor_b32 s1, s0, exec_lo +; GFX10-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10-NEXT: s_cmov_b32 exec_lo, s0 +; GFX10-NEXT: s_cbranch_scc0 .LBB2_7 ; GFX10-NEXT: ; %bb.6: ; %break.body ; GFX10-NEXT: v_mov_b32_e32 v0, 10 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir index fb436623bed2d5..9cf54f57c2f7f2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir @@ -22,7 +22,7 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %22(s1), %bb.1 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %21(s1), %bb.1 ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0 ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1 ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C]](s1), %bb.0, %11(s1), %bb.1 @@ -42,9 +42,7 @@ body: | ; GFX10-NEXT: G_BR %bb.2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.2: - ; GFX10-NEXT: 
[[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.1 ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32) ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY5]](s1), [[C5]], [[C4]] @@ -79,8 +77,6 @@ body: | bb.2: %16:_(s1) = G_PHI %10(s1), %bb.1 - %17:_(s32) = G_PHI %7(s32), %bb.1 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %17(s32) %18:_(s32) = G_FCONSTANT float 0.000000e+00 %19:_(s32) = G_FCONSTANT float 1.000000e+00 %20:_(s32) = G_SELECT %16(s1), %19, %18 @@ -109,7 +105,7 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %22(s1), %bb.1 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %21(s1), %bb.1 ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0 ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1 ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C]](s1), %bb.0, %11(s1), %bb.1 @@ -129,9 +125,7 @@ body: | ; GFX10-NEXT: G_BR %bb.2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.2: - ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.1 ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32) ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY5]](s1), [[C5]], [[C4]] @@ -166,8 +160,6 @@ body: | bb.2: %16:_(s1) = G_PHI %11(s1), %bb.1 - %17:_(s32) = G_PHI %7(s32), %bb.1 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %17(s32) %18:_(s32) = 
G_FCONSTANT float 0.000000e+00 %19:_(s32) = G_FCONSTANT float 1.000000e+00 %20:_(s32) = G_SELECT %16(s1), %19, %18 @@ -203,8 +195,8 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.3(0x50000000), %bb.5(0x30000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[DEF2]](s1), %bb.0, %53(s1), %bb.5 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %42(s1), %bb.5 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[DEF2]](s1), %bb.0, %52(s1), %bb.5 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %41(s1), %bb.5 ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI %13(s32), %bb.5, [[C]](s32), %bb.0 ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %15(s32), %bb.5 ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1) @@ -230,6 +222,7 @@ body: | ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 ; GFX10-NEXT: G_STORE [[C4]](s32), [[MV2]](p1) :: (store (s32), addrspace 1) + ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %25(s32) ; GFX10-NEXT: G_BR %bb.4 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.3: @@ -252,7 +245,6 @@ body: | ; GFX10-NEXT: G_BR %bb.5 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.4: - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %34(s32) ; GFX10-NEXT: S_ENDPGM 0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.5: @@ -273,9 +265,7 @@ body: | ; GFX10-NEXT: bb.6: ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.5 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_2]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI7]](s32) ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY14]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.2 bb.0: @@ -316,6 +306,7 @@ body: 
| %24:_(s32) = G_CONSTANT i32 10 G_STORE %24(s32), %9(p1) :: (store (s32), addrspace 1) + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %34(s32) G_BR %bb.4 bb.3: @@ -334,7 +325,6 @@ body: | G_BR %bb.5 bb.4: - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %34(s32) S_ENDPGM 0 bb.5: @@ -351,8 +341,6 @@ body: | successors: %bb.2(0x40000000), %bb.4(0x40000000) %37:sreg_32_xm0_xexec(s1) = G_PHI %35(s1), %bb.5 - %38:_(s32) = G_PHI %13(s32), %bb.5 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %38(s32) %34:sreg_32_xm0_xexec(s32) = SI_IF %37(s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec G_BR %bb.2 ... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll index 1934958ea8f37c..3795235192e67a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll @@ -14,10 +14,10 @@ define void @temporal_divergent_i32(float %val, ptr %addr) { ; GFX10-NEXT: v_cvt_f32_u32_e32 v4, v3 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v0 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB0_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX10-NEXT: ; %bb.2: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: flat_store_dword v[1:2], v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.mir index d1b473f2f41d87..8c83b85fdcfce3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.mir +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.mir @@ -33,8 +33,6 @@ body: | ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.2: ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.1 - ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.1 - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI3]](s32) ; GFX10-NEXT: G_STORE [[PHI2]](s32), [[MV]](p0) :: (store (s32)) ; GFX10-NEXT: SI_RETURN bb.0: @@ -63,8 +61,6 @@ body: | bb.2: %13:_(s32) = G_PHI %9(s32), %bb.1 - %14:_(s32) = G_PHI %7(s32), %bb.1 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %14(s32) G_STORE %13(s32), %3(p0) :: (store (s32)) SI_RETURN ... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll index f52b7c635a66f1..db47bd67730ae1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll @@ -8,14 +8,16 @@ define i32 @divergent_if_swap_brtarget_order0(i32 %value) { ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; CHECK-NEXT: s_mov_b64 s[4:5], exec +; CHECK-NEXT: s_cmp_lg_u64 vcc, 0 ; CHECK-NEXT: ; implicit-def: $vgpr0 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CHECK-NEXT: s_cbranch_execz .LBB0_2 +; CHECK-NEXT: s_cmov_b64 exec, vcc +; CHECK-NEXT: s_cbranch_scc0 .LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %if.true ; CHECK-NEXT: global_load_dword v0, v[0:1], off glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: .LBB0_2: ; %endif ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: .LBB0_2: ; %endif ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: %c = icmp ne i32 %value, 0 @@ -35,14 +37,16 @@ define i32 @divergent_if_swap_brtarget_order1(i32 %value) { ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; CHECK-NEXT: 
s_mov_b64 s[4:5], exec +; CHECK-NEXT: s_cmp_lg_u64 vcc, 0 ; CHECK-NEXT: ; implicit-def: $vgpr0 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CHECK-NEXT: s_cbranch_execz .LBB1_2 +; CHECK-NEXT: s_cmov_b64 exec, vcc +; CHECK-NEXT: s_cbranch_scc0 .LBB1_2 ; CHECK-NEXT: ; %bb.1: ; %if.true ; CHECK-NEXT: global_load_dword v0, v[0:1], off glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: .LBB1_2: ; %endif ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: .LBB1_2: ; %endif ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: %c = icmp ne i32 %value, 0 @@ -64,14 +68,16 @@ define i32 @divergent_if_nonboolean_condition0(i32 %value) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; CHECK-NEXT: s_mov_b64 s[4:5], exec +; CHECK-NEXT: s_cmp_lg_u64 vcc, 0 ; CHECK-NEXT: ; implicit-def: $vgpr0 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CHECK-NEXT: s_cbranch_execz .LBB2_2 +; CHECK-NEXT: s_cmov_b64 exec, vcc +; CHECK-NEXT: s_cbranch_scc0 .LBB2_2 ; CHECK-NEXT: ; %bb.1: ; %if.true ; CHECK-NEXT: global_load_dword v0, v[0:1], off glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: .LBB2_2: ; %endif ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: .LBB2_2: ; %endif ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: %c = trunc i32 %value to i1 @@ -92,17 +98,19 @@ define i32 @divergent_if_nonboolean_condition1(ptr addrspace(1) %ptr) { ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: global_load_dword v0, v[0:1], off +; CHECK-NEXT: s_mov_b64 s[4:5], exec ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; CHECK-NEXT: s_cmp_lg_u64 vcc, 0 ; CHECK-NEXT: ; implicit-def: $vgpr0 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CHECK-NEXT: s_cbranch_execz .LBB3_2 +; CHECK-NEXT: s_cmov_b64 exec, vcc +; CHECK-NEXT: s_cbranch_scc0 .LBB3_2 ; CHECK-NEXT: ; %bb.1: ; %if.true ; CHECK-NEXT: global_load_dword 
v0, v[0:1], off glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: .LBB3_2: ; %endif ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: .LBB3_2: ; %endif ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: %value = load i32, ptr addrspace(1) %ptr @@ -212,8 +220,9 @@ define amdgpu_kernel void @break_loop(i32 %arg) { ; CHECK-NEXT: ; in Loop: Header=BB5_3 Depth=1 ; CHECK-NEXT: s_and_b64 s[4:5], exec, s[2:3] ; CHECK-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] -; CHECK-NEXT: s_andn2_b64 exec, exec, s[0:1] -; CHECK-NEXT: s_cbranch_execz .LBB5_5 +; CHECK-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; CHECK-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; CHECK-NEXT: s_cbranch_scc0 .LBB5_5 ; CHECK-NEXT: .LBB5_3: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_u32_e32 v1, 1, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll index eb39ca2d7daa7f..d017523d574cea 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll @@ -1069,8 +1069,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB38_2 +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB38_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1093,8 +1094,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB38_2 +; GFX940-NEXT: 
s_cmp_lg_u64 vcc, 0 +; GFX940-NEXT: s_cmov_b64 exec, vcc +; GFX940-NEXT: s_cbranch_scc0 .LBB38_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1121,8 +1123,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB39_2 +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB39_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1143,8 +1146,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB39_2 +; GFX940-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX940-NEXT: s_cmov_b64 exec, vcc +; GFX940-NEXT: s_cbranch_scc0 .LBB39_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1171,8 +1175,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB40_2 +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB40_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1195,8 +1200,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX940-NEXT: 
v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB40_2 +; GFX940-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX940-NEXT: s_cmov_b64 exec, vcc +; GFX940-NEXT: s_cbranch_scc0 .LBB40_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1223,8 +1229,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB41_2 +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB41_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1245,8 +1252,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB41_2 +; GFX940-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX940-NEXT: s_cmov_b64 exec, vcc +; GFX940-NEXT: s_cbranch_scc0 .LBB41_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1390,8 +1398,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB47_2 +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB47_2 ; GFX90A-NEXT: ; 
%bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1412,8 +1421,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB47_2 +; GFX940-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX940-NEXT: s_cmov_b64 exec, vcc +; GFX940-NEXT: s_cbranch_scc0 .LBB47_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1725,8 +1735,9 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB59_2 +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB59_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1746,8 +1757,9 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB59_2 +; GFX940-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX940-NEXT: s_cmov_b64 exec, vcc +; GFX940-NEXT: s_cbranch_scc0 .LBB59_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1772,8 +1784,9 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3 ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: 
s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB60_2 +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB60_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1793,8 +1806,9 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3 ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB60_2 +; GFX940-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX940-NEXT: s_cmov_b64 exec, vcc +; GFX940-NEXT: s_cbranch_scc0 .LBB60_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1819,8 +1833,9 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB61_2 +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB61_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1840,8 +1855,9 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB61_2 +; GFX940-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX940-NEXT: s_cmov_b64 exec, vcc +; GFX940-NEXT: s_cbranch_scc0 .LBB61_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll index fcd8c6fb0fe7c6..fa6f5aa28b40aa 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll @@ -118,14 +118,14 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX908-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX908-NEXT: SI_WAVE_RECONVERGE [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.4.Flow: ; GFX908-NEXT: successors: %bb.5(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX908-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.5 (%ir-block.33): - ; GFX908-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw @@ -192,14 +192,14 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: SI_WAVE_RECONVERGE [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4.Flow: ; GFX90A-NEXT: successors: %bb.5(0x80000000) 
; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.5 (%ir-block.33): - ; GFX90A-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0 ; ; GFX940-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw @@ -266,14 +266,14 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: SI_WAVE_RECONVERGE [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: bb.4.Flow: ; GFX940-NEXT: successors: %bb.5(0x80000000) ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: bb.5 (%ir-block.33): - ; GFX940-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX940-NEXT: S_ENDPGM 0 ; ; GFX11-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw @@ -326,14 +326,14 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX11-NEXT: SI_WAVE_RECONVERGE [[SI_IF1]], implicit-def $exec, 
implicit-def $scc, implicit $exec ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: bb.4.Flow: ; GFX11-NEXT: successors: %bb.5(0x80000000) ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX11-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: bb.5 (%ir-block.26): - ; GFX11-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX11-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret void diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll index a8f9ed2e6fba93..bca645a80f49d9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll @@ -112,26 +112,26 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: SI_WAVE_RECONVERGE [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.5 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4.Flow: ; GFX90A-NEXT: successors: %bb.6(0x80000000) ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI %43, %bb.5, [[DEF]], %bb.1 - ; GFX90A-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.6 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.5 (%ir-block.35): ; GFX90A-NEXT: 
successors: %bb.4(0x80000000) ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.3, [[DEF]], %bb.2 - ; GFX90A-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec ; GFX90A-NEXT: [[STRICT_WWM1:%[0-9]+]]:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec ; GFX90A-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] ; GFX90A-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY21]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec ; GFX90A-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] ; GFX90A-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_ADD_F32_e64_6]], 0, [[COPY22]], [[V_CMP_EQ_U32_e64_]], implicit $exec + ; GFX90A-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.4 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.6 (%ir-block.41): @@ -205,26 +205,26 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: SI_WAVE_RECONVERGE [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX940-NEXT: S_BRANCH %bb.5 ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: bb.4.Flow: ; GFX940-NEXT: successors: %bb.6(0x80000000) ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI %42, %bb.5, [[DEF]], %bb.1 - ; GFX940-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX940-NEXT: S_BRANCH %bb.6 ; 
GFX940-NEXT: {{ $}} ; GFX940-NEXT: bb.5 (%ir-block.35): ; GFX940-NEXT: successors: %bb.4(0x80000000) ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.3, [[DEF]], %bb.2 - ; GFX940-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec ; GFX940-NEXT: [[STRICT_WWM1:%[0-9]+]]:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec ; GFX940-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] ; GFX940-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY21]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec ; GFX940-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] ; GFX940-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_ADD_F32_e64_6]], 0, [[COPY22]], [[V_CMP_EQ_U32_e64_]], implicit $exec + ; GFX940-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX940-NEXT: S_BRANCH %bb.4 ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: bb.6 (%ir-block.41): @@ -293,26 +293,26 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX11-NEXT: SI_WAVE_RECONVERGE [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX11-NEXT: S_BRANCH %bb.5 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: bb.4.Flow: ; GFX11-NEXT: successors: %bb.6(0x80000000) ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI %41, %bb.5, [[DEF]], %bb.1 - ; GFX11-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def 
$scc, implicit $exec ; GFX11-NEXT: S_BRANCH %bb.6 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: bb.5 (%ir-block.32): ; GFX11-NEXT: successors: %bb.4(0x80000000) ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.3, [[DEF]], %bb.2 - ; GFX11-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec ; GFX11-NEXT: [[STRICT_WWM1:%[0-9]+]]:vgpr_32 = STRICT_WWM [[V_WRITELANE_B32_]], implicit $exec ; GFX11-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] ; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY15]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec ; GFX11-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] ; GFX11-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_ADD_F32_e64_5]], 0, [[COPY16]], [[V_CMP_EQ_U32_e64_]], implicit $exec + ; GFX11-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX11-NEXT: S_BRANCH %bb.4 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: bb.6 (%ir-block.38): diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll index 80fa24471a459f..c9dce8759e6274 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll @@ -21,7 +21,7 @@ define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_atomicrmw(ptr addrspace(1) ; GFX90A-NEXT: successors: %bb.3(0x04000000), %bb.2(0x7c000000) ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI %13, %bb.2, [[S_MOV_B64_]], %bb.1 - ; GFX90A-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[GLOBAL_LOAD_DWORDX2_]], %bb.1, %19, %bb.2 + ; GFX90A-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[GLOBAL_LOAD_DWORDX2_]], %bb.1, %18, %bb.2 ; GFX90A-NEXT: 
[[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI1]], 0, [[REG_SEQUENCE1]], 0, 0, implicit $mode, implicit $exec ; GFX90A-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[V_ADD_F64_e64_]], %subreg.sub0_sub1, [[PHI1]], %subreg.sub2_sub3 ; GFX90A-NEXT: [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic monotonic (s64) on %ir.ptr, addrspace 1) @@ -31,8 +31,6 @@ define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_atomicrmw(ptr addrspace(1) ; GFX90A-NEXT: S_BRANCH %bb.3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.3.atomicrmw.end: - ; GFX90A-NEXT: [[PHI2:%[0-9]+]]:sreg_64_xexec = PHI [[SI_IF_BREAK]], %bb.2 - ; GFX90A-NEXT: SI_END_CF [[PHI2]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0 ; ; GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw @@ -70,7 +68,7 @@ define amdgpu_ps double @global_atomic_fadd_f64_rtn_atomicrmw(ptr addrspace(1) % ; GFX90A-NEXT: successors: %bb.3(0x04000000), %bb.2(0x7c000000) ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI %13, %bb.2, [[S_MOV_B64_]], %bb.1 - ; GFX90A-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[GLOBAL_LOAD_DWORDX2_]], %bb.1, %24, %bb.2 + ; GFX90A-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[GLOBAL_LOAD_DWORDX2_]], %bb.1, %23, %bb.2 ; GFX90A-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI1]], 0, [[REG_SEQUENCE1]], 0, 0, implicit $mode, implicit $exec ; GFX90A-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[V_ADD_F64_e64_]], %subreg.sub0_sub1, [[PHI1]], %subreg.sub2_sub3 ; GFX90A-NEXT: [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic monotonic (s64) on %ir.ptr, addrspace 1) @@ -81,8 +79,6 
@@ define amdgpu_ps double @global_atomic_fadd_f64_rtn_atomicrmw(ptr addrspace(1) % ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.3.atomicrmw.end: ; GFX90A-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN]], %bb.2 - ; GFX90A-NEXT: [[PHI3:%[0-9]+]]:sreg_64_xexec = PHI [[SI_IF_BREAK]], %bb.2 - ; GFX90A-NEXT: SI_END_CF [[PHI3]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub0 ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub1 ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll index 10cbc56cc5fbea..e0eadf09c5b8db 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll @@ -18,7 +18,7 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) { ; CHECK-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane ; CHECK-NEXT: v_mov_b32_e32 v8, v0 ; CHECK-NEXT: s_or_saveexec_b32 s21, -1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b32 exec_lo, s21 ; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; CHECK-NEXT: v_mov_b32_e32 v15, v1 @@ -62,21 +62,19 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) { ; CHECK-NEXT: v_writelane_b32 v0, s5, 1 ; CHECK-NEXT: v_writelane_b32 v0, s6, 2 ; CHECK-NEXT: v_writelane_b32 v0, s7, 3 +; CHECK-NEXT: s_or_saveexec_b32 s21, -1 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b32 exec_lo, s21 ; CHECK-NEXT: s_mov_b32 s6, 0 ; CHECK-NEXT: s_mov_b32 s4, s6 ; CHECK-NEXT: s_mov_b32 s5, s6 -; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: 
v_mov_b32_e32 v2, s5 -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b32 s4, exec_lo -; CHECK-NEXT: v_writelane_b32 v0, s4, 4 -; CHECK-NEXT: s_or_saveexec_b32 s21, -1 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: v_mov_b32_e32 v1, s5 ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b32 exec_lo, s21 +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_or_saveexec_b32 s21, -1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b32 exec_lo, s21 ; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload @@ -111,14 +109,14 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) { ; CHECK-NEXT: s_mov_b32 s17, s6 ; CHECK-NEXT: s_mov_b32 s18, s5 ; CHECK-NEXT: s_mov_b32 s19, s4 -; CHECK-NEXT: v_writelane_b32 v0, s12, 5 -; CHECK-NEXT: v_writelane_b32 v0, s13, 6 -; CHECK-NEXT: v_writelane_b32 v0, s14, 7 -; CHECK-NEXT: v_writelane_b32 v0, s15, 8 -; CHECK-NEXT: v_writelane_b32 v0, s16, 9 -; CHECK-NEXT: v_writelane_b32 v0, s17, 10 -; CHECK-NEXT: v_writelane_b32 v0, s18, 11 -; CHECK-NEXT: v_writelane_b32 v0, s19, 12 +; CHECK-NEXT: v_writelane_b32 v0, s12, 4 +; CHECK-NEXT: v_writelane_b32 v0, s13, 5 +; CHECK-NEXT: v_writelane_b32 v0, s14, 6 +; CHECK-NEXT: v_writelane_b32 v0, s15, 7 +; CHECK-NEXT: v_writelane_b32 v0, s16, 8 +; CHECK-NEXT: v_writelane_b32 v0, s17, 9 +; CHECK-NEXT: v_writelane_b32 v0, s18, 10 +; CHECK-NEXT: v_writelane_b32 v0, s19, 11 ; CHECK-NEXT: v_mov_b32_e32 v7, v9 ; CHECK-NEXT: v_mov_b32_e32 v8, v10 ; CHECK-NEXT: 
v_mov_b32_e32 v5, v11 @@ -139,46 +137,40 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) { ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[6:7], v[1:2] ; CHECK-NEXT: s_and_b32 s4, s4, s5 ; CHECK-NEXT: s_and_saveexec_b32 s4, s4 -; CHECK-NEXT: v_writelane_b32 v0, s4, 13 +; CHECK-NEXT: v_writelane_b32 v0, s4, 12 ; CHECK-NEXT: s_or_saveexec_b32 s21, -1 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b32 exec_lo, s21 ; CHECK-NEXT: ; %bb.2: ; in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_or_saveexec_b32 s21, -1 -; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b32 exec_lo, s21 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_readlane_b32 s4, v2, 13 -; CHECK-NEXT: v_readlane_b32 s8, v2, 5 -; CHECK-NEXT: v_readlane_b32 s9, v2, 6 -; CHECK-NEXT: v_readlane_b32 s10, v2, 7 -; CHECK-NEXT: v_readlane_b32 s11, v2, 8 -; CHECK-NEXT: v_readlane_b32 s12, v2, 9 -; CHECK-NEXT: v_readlane_b32 s13, v2, 10 -; CHECK-NEXT: v_readlane_b32 s14, v2, 11 -; CHECK-NEXT: v_readlane_b32 s15, v2, 12 +; CHECK-NEXT: v_readlane_b32 s5, v2, 12 +; CHECK-NEXT: v_readlane_b32 s8, v2, 4 +; CHECK-NEXT: v_readlane_b32 s9, v2, 5 +; CHECK-NEXT: v_readlane_b32 s10, v2, 6 +; CHECK-NEXT: v_readlane_b32 s11, v2, 7 +; CHECK-NEXT: v_readlane_b32 s12, v2, 8 +; CHECK-NEXT: v_readlane_b32 s13, v2, 9 +; CHECK-NEXT: v_readlane_b32 s14, v2, 10 +; CHECK-NEXT: v_readlane_b32 s15, v2, 11 ; CHECK-NEXT: v_readlane_b32 s16, v2, 0 ; CHECK-NEXT: v_readlane_b32 s17, v2, 1 ; CHECK-NEXT: v_readlane_b32 s18, v2, 2 ; CHECK-NEXT: v_readlane_b32 s19, v2, 3 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: image_sample v0, v[0:1], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; CHECK-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: s_cbranch_execnz .LBB0_1 +; CHECK-NEXT: s_xor_b32 s4, exec_lo, s5 +; CHECK-NEXT: s_cselect_b32 exec_lo, s4, s5 +; CHECK-NEXT: s_cbranch_scc1 .LBB0_1 ; CHECK-NEXT: ; %bb.3: -; CHECK-NEXT: s_or_saveexec_b32 s21, -1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b32 exec_lo, s21 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_readlane_b32 s4, v0, 4 -; CHECK-NEXT: s_mov_b32 exec_lo, s4 ; CHECK-NEXT: ; %bb.4: ; CHECK-NEXT: s_or_saveexec_b32 s21, -1 -; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b32 exec_lo, s21 ; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; CHECK-NEXT: ; implicit-def: $sgpr4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-atomicrmw.ll index d4a7f3b2d387d0..d84e9723b58469 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-atomicrmw.ll @@ -34,15 +34,13 @@ define float @test_atomicrmw_fsub(ptr addrspace(3) %addr) { ; CHECK-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[LOAD]](s32), %bb.1, %13(s32), %bb.2 ; CHECK-NEXT: [[FSUB:%[0-9]+]]:_(s32) = G_FSUB [[PHI1]], [[C]] ; CHECK-NEXT: [[ATOMIC_CMPXCHG_WITH_SUCCESS:%[0-9]+]]:_(s32), [[ATOMIC_CMPXCHG_WITH_SUCCESS1:%[0-9]+]]:_(s1) = G_ATOMIC_CMPXCHG_WITH_SUCCESS [[COPY]](p3), [[PHI1]], [[FSUB]] :: (load store seq_cst seq_cst (s32) 
on %ir.addr, addrspace 3) - ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[ATOMIC_CMPXCHG_WITH_SUCCESS1]](s1), [[PHI]](s64) - ; CHECK-NEXT: [[INT1:%[0-9]+]]:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), [[INT]](s64) - ; CHECK-NEXT: G_BRCOND [[INT1]](s1), %bb.3 + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[ATOMIC_CMPXCHG_WITH_SUCCESS1]](s1), [[PHI]](s64) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:%[0-9]+]]:_(s1) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), [[INTRINSIC_CONVERGENT]](s64) + ; CHECK-NEXT: G_BRCOND [[INTRINSIC_CONVERGENT_W_SIDE_EFFECTS]](s1), %bb.3 ; CHECK-NEXT: G_BR %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3.atomicrmw.end: ; CHECK-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[ATOMIC_CMPXCHG_WITH_SUCCESS]](s32), %bb.2 - ; CHECK-NEXT: [[PHI3:%[0-9]+]]:_(s64) = G_PHI [[INT]](s64), %bb.2 - ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI3]](s64) ; CHECK-NEXT: $vgpr0 = COPY [[PHI2]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %oldval = atomicrmw fsub ptr addrspace(3) %addr, float 1.0 seq_cst @@ -87,15 +85,13 @@ define <2 x half> @test_atomicrmw_fsub_vector(ptr addrspace(3) %addr) { ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[PHI1]](<2 x s16>) ; CHECK-NEXT: [[ATOMIC_CMPXCHG_WITH_SUCCESS:%[0-9]+]]:_(s32), [[ATOMIC_CMPXCHG_WITH_SUCCESS1:%[0-9]+]]:_(s1) = G_ATOMIC_CMPXCHG_WITH_SUCCESS [[COPY]](p3), [[BITCAST1]], [[BITCAST]] :: (load store seq_cst seq_cst (s32) on %ir.addr, addrspace 3) ; CHECK-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[ATOMIC_CMPXCHG_WITH_SUCCESS]](s32) - ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[ATOMIC_CMPXCHG_WITH_SUCCESS1]](s1), [[PHI]](s64) - ; CHECK-NEXT: [[INT1:%[0-9]+]]:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), [[INT]](s64) - ; CHECK-NEXT: G_BRCOND 
[[INT1]](s1), %bb.3 + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[ATOMIC_CMPXCHG_WITH_SUCCESS1]](s1), [[PHI]](s64) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:%[0-9]+]]:_(s1) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), [[INTRINSIC_CONVERGENT]](s64) + ; CHECK-NEXT: G_BRCOND [[INTRINSIC_CONVERGENT_W_SIDE_EFFECTS]](s1), %bb.3 ; CHECK-NEXT: G_BR %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3.atomicrmw.end: ; CHECK-NEXT: [[PHI2:%[0-9]+]]:_(<2 x s16>) = G_PHI [[BITCAST2]](<2 x s16>), %bb.2 - ; CHECK-NEXT: [[PHI3:%[0-9]+]]:_(s64) = G_PHI [[INT]](s64), %bb.2 - ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI3]](s64) ; CHECK-NEXT: $vgpr0 = COPY [[PHI2]](<2 x s16>) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %oldval = atomicrmw fsub ptr addrspace(3) %addr, <2 x half> seq_cst @@ -125,15 +121,13 @@ define <2 x half> @test_atomicrmw_fmin_vector(ptr addrspace(3) %addr) { ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[PHI1]](<2 x s16>) ; CHECK-NEXT: [[ATOMIC_CMPXCHG_WITH_SUCCESS:%[0-9]+]]:_(s32), [[ATOMIC_CMPXCHG_WITH_SUCCESS1:%[0-9]+]]:_(s1) = G_ATOMIC_CMPXCHG_WITH_SUCCESS [[COPY]](p3), [[BITCAST1]], [[BITCAST]] :: (load store seq_cst seq_cst (s32) on %ir.addr, addrspace 3) ; CHECK-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[ATOMIC_CMPXCHG_WITH_SUCCESS]](s32) - ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[ATOMIC_CMPXCHG_WITH_SUCCESS1]](s1), [[PHI]](s64) - ; CHECK-NEXT: [[INT1:%[0-9]+]]:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), [[INT]](s64) - ; CHECK-NEXT: G_BRCOND [[INT1]](s1), %bb.3 + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[ATOMIC_CMPXCHG_WITH_SUCCESS1]](s1), [[PHI]](s64) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:%[0-9]+]]:_(s1) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS 
intrinsic(@llvm.amdgcn.loop), [[INTRINSIC_CONVERGENT]](s64) + ; CHECK-NEXT: G_BRCOND [[INTRINSIC_CONVERGENT_W_SIDE_EFFECTS]](s1), %bb.3 ; CHECK-NEXT: G_BR %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3.atomicrmw.end: ; CHECK-NEXT: [[PHI2:%[0-9]+]]:_(<2 x s16>) = G_PHI [[BITCAST2]](<2 x s16>), %bb.2 - ; CHECK-NEXT: [[PHI3:%[0-9]+]]:_(s64) = G_PHI [[INT]](s64), %bb.2 - ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI3]](s64) ; CHECK-NEXT: $vgpr0 = COPY [[PHI2]](<2 x s16>) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %oldval = atomicrmw fmin ptr addrspace(3) %addr, <2 x half> seq_cst @@ -163,15 +157,13 @@ define <2 x half> @test_atomicrmw_fmax_vector(ptr addrspace(3) %addr) { ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[PHI1]](<2 x s16>) ; CHECK-NEXT: [[ATOMIC_CMPXCHG_WITH_SUCCESS:%[0-9]+]]:_(s32), [[ATOMIC_CMPXCHG_WITH_SUCCESS1:%[0-9]+]]:_(s1) = G_ATOMIC_CMPXCHG_WITH_SUCCESS [[COPY]](p3), [[BITCAST1]], [[BITCAST]] :: (load store seq_cst seq_cst (s32) on %ir.addr, addrspace 3) ; CHECK-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[ATOMIC_CMPXCHG_WITH_SUCCESS]](s32) - ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[ATOMIC_CMPXCHG_WITH_SUCCESS1]](s1), [[PHI]](s64) - ; CHECK-NEXT: [[INT1:%[0-9]+]]:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), [[INT]](s64) - ; CHECK-NEXT: G_BRCOND [[INT1]](s1), %bb.3 + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[ATOMIC_CMPXCHG_WITH_SUCCESS1]](s1), [[PHI]](s64) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:%[0-9]+]]:_(s1) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), [[INTRINSIC_CONVERGENT]](s64) + ; CHECK-NEXT: G_BRCOND [[INTRINSIC_CONVERGENT_W_SIDE_EFFECTS]](s1), %bb.3 ; CHECK-NEXT: G_BR %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3.atomicrmw.end: ; CHECK-NEXT: [[PHI2:%[0-9]+]]:_(<2 x s16>) = G_PHI [[BITCAST2]](<2 x s16>), %bb.2 - ; 
CHECK-NEXT: [[PHI3:%[0-9]+]]:_(s64) = G_PHI [[INT]](s64), %bb.2 - ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI3]](s64) ; CHECK-NEXT: $vgpr0 = COPY [[PHI2]](<2 x s16>) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %oldval = atomicrmw fmax ptr addrspace(3) %addr, <2 x half> seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll index 6d32d4c720c991..7dbb60a45d3180 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll @@ -97,18 +97,18 @@ define void @i1_arg_i1_use(i1 %arg) #0 { ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[TRUNC]], [[C]] - ; CHECK-NEXT: [[INTRINSIC_W_SIDE_EFFECTS:%[0-9]+]]:_(s1), [[INTRINSIC_W_SIDE_EFFECTS1:%[0-9]+]]:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), [[XOR]](s1) - ; CHECK-NEXT: G_BRCOND [[INTRINSIC_W_SIDE_EFFECTS]](s1), %bb.2 + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:%[0-9]+]]:_(s1), [[INTRINSIC_CONVERGENT_W_SIDE_EFFECTS1:%[0-9]+]]:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), [[XOR]](s1) + ; CHECK-NEXT: G_BRCOND [[INTRINSIC_CONVERGENT_W_SIDE_EFFECTS]](s1), %bb.2 ; CHECK-NEXT: G_BR %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2.bb1: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: G_STORE [[C1]](s32), [[DEF]](p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1) + ; CHECK-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), [[INTRINSIC_CONVERGENT_W_SIDE_EFFECTS1]](s64) ; CHECK-NEXT: G_BR %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3.bb2: - ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INTRINSIC_W_SIDE_EFFECTS1]](s64) ; CHECK-NEXT: SI_RETURN bb: br 
i1 %arg, label %bb2, label %bb1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/is-safe-to-sink-bug.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/is-safe-to-sink-bug.ll index d3bc661f5940b6..fcbacf52327233 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/is-safe-to-sink-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/is-safe-to-sink-bug.ll @@ -41,8 +41,7 @@ define amdgpu_ps void @_amdgpu_ps_main(i1 %arg) { ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB0_3: ; %bb6 ; CHECK-NEXT: ; in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; CHECK-NEXT: s_and_b32 s2, 1, s2 +; CHECK-NEXT: s_and_b32 s2, 1, s3 ; CHECK-NEXT: v_or_b32_e32 v1, 1, v0 ; CHECK-NEXT: v_cmp_ne_u32_e64 s2, 0, s2 ; CHECK-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v0 @@ -54,12 +53,15 @@ define amdgpu_ps void @_amdgpu_ps_main(i1 %arg) { ; CHECK-NEXT: s_cbranch_vccz .LBB0_1 ; CHECK-NEXT: .LBB0_4: ; %bb2 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_mov_b32 s2, 0 -; CHECK-NEXT: s_and_saveexec_b32 s3, s0 -; CHECK-NEXT: s_cbranch_execz .LBB0_3 +; CHECK-NEXT: s_mov_b32 s2, exec_lo +; CHECK-NEXT: s_and_b32 s4, s0, exec_lo +; CHECK-NEXT: s_mov_b32 s3, 0 +; CHECK-NEXT: s_cmov_b32 exec_lo, s4 +; CHECK-NEXT: s_cbranch_scc0 .LBB0_3 ; CHECK-NEXT: ; %bb.5: ; %bb5 ; CHECK-NEXT: ; in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: s_mov_b32 s2, 1 +; CHECK-NEXT: s_mov_b32 s3, 1 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; CHECK-NEXT: s_branch .LBB0_3 bb: %i = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> zeroinitializer, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.if.xfail.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.if.xfail.mir index 9716bb31db3fdc..bfc398dba3a959 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.if.xfail.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.if.xfail.mir @@ -2,7 +2,7 @@ # Make sure there's no crash if there is somehow no successor block. 
-# ERR: remark: :0:0: unable to legalize instruction: %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2:_(s1) (in function: brcond_si_if_no_succ_block) +# ERR: remark: :0:0: unable to legalize instruction: %3:_(s1), %4:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2:_(s1) (in function: brcond_si_if_no_succ_block) --- name: brcond_si_if_no_succ_block @@ -16,6 +16,6 @@ body: | %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s1) = G_ICMP intpred(ne), %0, %1 - %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2 + %3:_(s1), %4:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2 G_BRCOND %3, %bb.1 ... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-brcond.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-brcond.mir index 57bbe020dca850..b85aa15ad508aa 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-brcond.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-brcond.mir @@ -150,7 +150,7 @@ body: | %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s1) = G_ICMP intpred(ne), %0, %1 - %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2 + %3:_(s1), %4:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2 G_BRCOND %3, %bb.1 bb.1: @@ -189,7 +189,7 @@ body: | %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s1) = G_ICMP intpred(ne), %0, %1 - %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.else), %2 + %3:_(s1), %4:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.else), %2 G_BRCOND %3, %bb.1 bb.1: @@ -244,7 +244,7 @@ body: | bb.1: successors: %bb.1, %bb.2 S_NOP 0 - %3:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), %2 + %3:_(s1) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), %2 G_BRCOND %3, %bb.2 G_BR %bb.1 @@ -303,7 +303,7 @@ body: | bb.1: successors: %bb.1, %bb.2 S_NOP 0 - %3:_(s1) = 
G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), %2 + %3:_(s1) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), %2 G_BRCOND %3, %bb.1 G_BR %bb.2 @@ -360,7 +360,7 @@ body: | bb.1: successors: %bb.1, %bb.2 S_NOP 0 - %3:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), %2 + %3:_(s1) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), %2 G_BRCOND %3, %bb.1 bb.2: @@ -405,7 +405,7 @@ body: | %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s1) = G_ICMP intpred(ne), %0, %1 - %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2 + %3:_(s1), %4:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2 %5:_(s32) = COPY $vgpr2 G_BRCOND %3, %bb.1 @@ -466,7 +466,7 @@ body: | bb.1: successors: %bb.1, %bb.2 S_NOP 0 - %3:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), %2 + %3:_(s1) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), %2 S_NOP 0 S_NOP 0 G_BRCOND %3, %bb.2 @@ -521,7 +521,7 @@ body: | %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s1) = G_ICMP intpred(ne), %0, %1 - %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2 + %3:_(s1), %4:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2 %5:_(s1) = G_CONSTANT i1 true %6:_(s1) = G_XOR %3, %5 G_BRCOND %6, %bb.2 @@ -588,7 +588,7 @@ body: | %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s1) = G_ICMP intpred(ne), %0, %1 - %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2 + %3:_(s1), %4:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2 %5:_(s1) = G_CONSTANT i1 true %6:_(s1) = G_XOR %3, %5 G_BRCOND %6, %bb.2 @@ -653,7 +653,7 @@ body: | bb.1: successors: %bb.1, %bb.2 S_NOP 0 - %3:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), %2 + %3:_(s1) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), %2 %4:_(s1) = G_CONSTANT i1 true 
%5:_(s1) = G_XOR %3, %4 G_BRCOND %5, %bb.1 @@ -711,7 +711,7 @@ body: | bb.1: successors: %bb.1, %bb.2 S_NOP 0 - %3:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), %2 + %3:_(s1) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), %2 %4:_(s1) = G_CONSTANT i1 true %5:_(s1) = G_XOR %3, %4 G_BRCOND %5, %bb.2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll index 614f59c564df64..25416d164d2468 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll @@ -157,7 +157,6 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, ; GFX1030-NEXT: v_mov_b32_e32 v31, v10 ; GFX1030-NEXT: v_mov_b32_e32 v19, v11 ; GFX1030-NEXT: v_mov_b32_e32 v20, v12 -; GFX1030-NEXT: s_mov_b32 s1, exec_lo ; GFX1030-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 ; GFX1030-NEXT: v_readfirstlane_b32 s4, v19 ; GFX1030-NEXT: v_readfirstlane_b32 s5, v20 @@ -168,6 +167,7 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, ; GFX1030-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1030-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[21:31], s[4:7] +; GFX1030-NEXT: s_xor_b32 s1, exec_lo, s0 ; GFX1030-NEXT: ; implicit-def: $vgpr19 ; GFX1030-NEXT: ; implicit-def: $vgpr21 ; GFX1030-NEXT: ; implicit-def: $vgpr22 @@ -181,10 +181,9 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, ; GFX1030-NEXT: ; implicit-def: $vgpr30 ; GFX1030-NEXT: ; implicit-def: $vgpr31 ; GFX1030-NEXT: ; implicit-def: $vgpr11_vgpr12_vgpr13_vgpr14 -; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1030-NEXT: s_cbranch_execnz .LBB6_1 +; GFX1030-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1030-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1030-NEXT: ; %bb.2: -; GFX1030-NEXT: s_mov_b32 exec_lo, s1 ; GFX1030-NEXT: s_waitcnt 
vmcnt(0) ; GFX1030-NEXT: ; return to shader part epilog ; @@ -192,7 +191,6 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, ; GFX1013: ; %bb.0: ; GFX1013-NEXT: v_mov_b32_e32 v19, v11 ; GFX1013-NEXT: v_mov_b32_e32 v20, v12 -; GFX1013-NEXT: s_mov_b32 s1, exec_lo ; GFX1013-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 ; GFX1013-NEXT: v_readfirstlane_b32 s4, v19 ; GFX1013-NEXT: v_readfirstlane_b32 s5, v20 @@ -203,14 +201,14 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, ; GFX1013-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1013-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1013-NEXT: image_bvh_intersect_ray v[15:18], v[0:10], s[4:7] +; GFX1013-NEXT: s_xor_b32 s1, exec_lo, s0 ; GFX1013-NEXT: ; implicit-def: $vgpr19 ; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 ; GFX1013-NEXT: ; implicit-def: $vgpr11_vgpr12_vgpr13_vgpr14 ; GFX1013-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1013-NEXT: s_cbranch_execnz .LBB6_1 +; GFX1013-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1013-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1013-NEXT: ; %bb.2: -; GFX1013-NEXT: s_mov_b32 exec_lo, s1 ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: v_mov_b32_e32 v0, v15 ; GFX1013-NEXT: v_mov_b32_e32 v1, v16 @@ -224,7 +222,6 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, ; GFX11-NEXT: v_dual_mov_b32 v15, v2 :: v_dual_mov_b32 v16, v3 ; GFX11-NEXT: v_dual_mov_b32 v17, v4 :: v_dual_mov_b32 v18, v11 ; GFX11-NEXT: v_mov_b32_e32 v19, v12 -; GFX11-NEXT: s_mov_b32 s1, exec_lo ; GFX11-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readfirstlane_b32 s4, v18 @@ -238,6 +235,7 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; 
GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v20, v21, v[15:17], v[5:7], v[8:10]], s[4:7] +; GFX11-NEXT: s_xor_b32 s1, exec_lo, s0 ; GFX11-NEXT: ; implicit-def: $vgpr18 ; GFX11-NEXT: ; implicit-def: $vgpr20 ; GFX11-NEXT: ; implicit-def: $vgpr21 @@ -245,10 +243,10 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, ; GFX11-NEXT: ; implicit-def: $vgpr5_vgpr6_vgpr7 ; GFX11-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10 ; GFX11-NEXT: ; implicit-def: $vgpr11_vgpr12_vgpr13_vgpr14 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB6_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr) @@ -259,46 +257,45 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) { ; GFX1030-LABEL: image_bvh_intersect_ray_a16_vgpr_descr: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: v_mov_b32_e32 v18, v0 -; GFX1030-NEXT: v_mov_b32_e32 v19, v1 +; GFX1030-NEXT: v_mov_b32_e32 v16, v0 +; GFX1030-NEXT: v_mov_b32_e32 v17, v1 ; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v5 ; GFX1030-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; GFX1030-NEXT: v_mov_b32_e32 v20, v2 +; GFX1030-NEXT: v_mov_b32_e32 v18, v2 ; GFX1030-NEXT: v_and_b32_e32 v2, 0xffff, v8 -; GFX1030-NEXT: v_mov_b32_e32 v21, v3 +; GFX1030-NEXT: v_mov_b32_e32 v19, v3 ; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1030-NEXT: v_mov_b32_e32 
v22, v4 -; GFX1030-NEXT: v_mov_b32_e32 v16, v9 -; GFX1030-NEXT: v_mov_b32_e32 v17, v10 -; GFX1030-NEXT: v_and_or_b32 v23, 0xffff, v5, v0 -; GFX1030-NEXT: v_and_or_b32 v24, 0xffff, v6, v1 -; GFX1030-NEXT: v_alignbit_b32 v25, v2, v7, 16 -; GFX1030-NEXT: s_mov_b32 s1, exec_lo +; GFX1030-NEXT: v_mov_b32_e32 v20, v4 +; GFX1030-NEXT: v_mov_b32_e32 v14, v9 +; GFX1030-NEXT: v_mov_b32_e32 v15, v10 +; GFX1030-NEXT: v_and_or_b32 v21, 0xffff, v5, v0 +; GFX1030-NEXT: v_and_or_b32 v22, 0xffff, v6, v1 +; GFX1030-NEXT: v_alignbit_b32 v23, v2, v7, 16 ; GFX1030-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX1030-NEXT: v_readfirstlane_b32 s4, v16 -; GFX1030-NEXT: v_readfirstlane_b32 s5, v17 +; GFX1030-NEXT: v_readfirstlane_b32 s4, v14 +; GFX1030-NEXT: v_readfirstlane_b32 s5, v15 ; GFX1030-NEXT: v_readfirstlane_b32 s6, v11 ; GFX1030-NEXT: v_readfirstlane_b32 s7, v12 -; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[16:17] +; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[14:15] ; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[11:12] ; GFX1030-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1030-NEXT: s_and_saveexec_b32 s0, s0 -; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[18:25], s[4:7] a16 +; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[16:23], s[4:7] a16 +; GFX1030-NEXT: s_xor_b32 s1, exec_lo, s0 +; GFX1030-NEXT: ; implicit-def: $vgpr14 ; GFX1030-NEXT: ; implicit-def: $vgpr16 +; GFX1030-NEXT: ; implicit-def: $vgpr17 ; GFX1030-NEXT: ; implicit-def: $vgpr18 ; GFX1030-NEXT: ; implicit-def: $vgpr19 ; GFX1030-NEXT: ; implicit-def: $vgpr20 ; GFX1030-NEXT: ; implicit-def: $vgpr21 ; GFX1030-NEXT: ; implicit-def: $vgpr22 ; GFX1030-NEXT: ; implicit-def: $vgpr23 -; GFX1030-NEXT: ; implicit-def: $vgpr24 -; GFX1030-NEXT: ; implicit-def: $vgpr25 ; GFX1030-NEXT: ; implicit-def: $vgpr9_vgpr10_vgpr11_vgpr12 -; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1030-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1030-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1030-NEXT: s_cbranch_scc1 .LBB7_1 ; 
GFX1030-NEXT: ; %bb.2: -; GFX1030-NEXT: s_mov_b32 exec_lo, s1 ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: ; return to shader part epilog ; @@ -309,7 +306,6 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p ; GFX1013-NEXT: v_lshrrev_b32_e32 v9, 16, v5 ; GFX1013-NEXT: v_and_b32_e32 v10, 0xffff, v7 ; GFX1013-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX1013-NEXT: s_mov_b32 s1, exec_lo ; GFX1013-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; GFX1013-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; GFX1013-NEXT: v_alignbit_b32 v7, v8, v7, 16 @@ -325,14 +321,14 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p ; GFX1013-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1013-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1013-NEXT: image_bvh_intersect_ray v[13:16], v[0:7], s[4:7] a16 +; GFX1013-NEXT: s_xor_b32 s1, exec_lo, s0 ; GFX1013-NEXT: ; implicit-def: $vgpr17 ; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1013-NEXT: ; implicit-def: $vgpr9_vgpr10_vgpr11_vgpr12 ; GFX1013-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1013-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1013-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1013-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1013-NEXT: ; %bb.2: -; GFX1013-NEXT: s_mov_b32 exec_lo, s1 ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: v_mov_b32_e32 v0, v13 ; GFX1013-NEXT: v_mov_b32_e32 v1, v14 @@ -351,29 +347,29 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p ; GFX11-NEXT: v_lshl_or_b32 v4, v5, 16, v0 ; GFX11-NEXT: v_perm_b32 v5, v5, v7, 0x7060302 ; GFX11-NEXT: v_lshl_or_b32 v6, v6, 16, v1 -; GFX11-NEXT: s_mov_b32 s1, exec_lo ; GFX11-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_readfirstlane_b32 s4, v18 ; GFX11-NEXT: v_readfirstlane_b32 s5, v19 ; GFX11-NEXT: v_readfirstlane_b32 s6, v11 ; GFX11-NEXT: 
v_readfirstlane_b32 s7, v12 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[18:19] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[11:12] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v16, v17, v[13:15], v[4:6]], s[4:7] a16 +; GFX11-NEXT: s_xor_b32 s1, exec_lo, s0 ; GFX11-NEXT: ; implicit-def: $vgpr18 ; GFX11-NEXT: ; implicit-def: $vgpr16 ; GFX11-NEXT: ; implicit-def: $vgpr17 ; GFX11-NEXT: ; implicit-def: $vgpr13_vgpr14_vgpr15 ; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX11-NEXT: ; implicit-def: $vgpr9_vgpr10_vgpr11_vgpr12 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB7_1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) @@ -398,7 +394,6 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr ; GFX1030-NEXT: v_mov_b32_e32 v33, v11 ; GFX1030-NEXT: v_mov_b32_e32 v20, v12 ; GFX1030-NEXT: v_mov_b32_e32 v21, v13 -; GFX1030-NEXT: s_mov_b32 s1, exec_lo ; GFX1030-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX1030-NEXT: v_readfirstlane_b32 s4, v20 ; GFX1030-NEXT: v_readfirstlane_b32 s5, v21 @@ -409,6 +404,7 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr ; GFX1030-NEXT: s_and_b32 
s0, vcc_lo, s0 ; GFX1030-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[22:33], s[4:7] +; GFX1030-NEXT: s_xor_b32 s1, exec_lo, s0 ; GFX1030-NEXT: ; implicit-def: $vgpr20 ; GFX1030-NEXT: ; implicit-def: $vgpr22 ; GFX1030-NEXT: ; implicit-def: $vgpr23 @@ -423,10 +419,9 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr ; GFX1030-NEXT: ; implicit-def: $vgpr32 ; GFX1030-NEXT: ; implicit-def: $vgpr33 ; GFX1030-NEXT: ; implicit-def: $vgpr12_vgpr13_vgpr14_vgpr15 -; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1030-NEXT: s_cbranch_execnz .LBB8_1 +; GFX1030-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1030-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1030-NEXT: ; %bb.2: -; GFX1030-NEXT: s_mov_b32 exec_lo, s1 ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: ; return to shader part epilog ; @@ -434,7 +429,6 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr ; GFX1013: ; %bb.0: ; GFX1013-NEXT: v_mov_b32_e32 v20, v12 ; GFX1013-NEXT: v_mov_b32_e32 v21, v13 -; GFX1013-NEXT: s_mov_b32 s1, exec_lo ; GFX1013-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX1013-NEXT: v_readfirstlane_b32 s4, v20 ; GFX1013-NEXT: v_readfirstlane_b32 s5, v21 @@ -445,14 +439,14 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr ; GFX1013-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1013-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1013-NEXT: image_bvh64_intersect_ray v[16:19], v[0:11], s[4:7] +; GFX1013-NEXT: s_xor_b32 s1, exec_lo, s0 ; GFX1013-NEXT: ; implicit-def: $vgpr20 ; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 ; GFX1013-NEXT: ; implicit-def: $vgpr12_vgpr13_vgpr14_vgpr15 ; GFX1013-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1013-NEXT: s_cbranch_execnz .LBB8_1 +; GFX1013-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1013-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1013-NEXT: ; %bb.2: -; 
GFX1013-NEXT: s_mov_b32 exec_lo, s1 ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: v_mov_b32_e32 v0, v16 ; GFX1013-NEXT: v_mov_b32_e32 v1, v17 @@ -466,7 +460,6 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr ; GFX11-NEXT: v_dual_mov_b32 v21, v2 :: v_dual_mov_b32 v16, v3 ; GFX11-NEXT: v_dual_mov_b32 v17, v4 :: v_dual_mov_b32 v18, v5 ; GFX11-NEXT: v_dual_mov_b32 v4, v12 :: v_dual_mov_b32 v5, v13 -; GFX11-NEXT: s_mov_b32 s1, exec_lo ; GFX11-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readfirstlane_b32 s4, v4 @@ -480,6 +473,7 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[19:20], v21, v[16:18], v[6:8], v[9:11]], s[4:7] +; GFX11-NEXT: s_xor_b32 s1, exec_lo, s0 ; GFX11-NEXT: ; implicit-def: $vgpr4 ; GFX11-NEXT: ; implicit-def: $vgpr19_vgpr20 ; GFX11-NEXT: ; implicit-def: $vgpr21 @@ -487,10 +481,10 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr ; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8 ; GFX11-NEXT: ; implicit-def: $vgpr9_vgpr10_vgpr11 ; GFX11-NEXT: ; implicit-def: $vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr) @@ -517,7 +511,6 @@ define amdgpu_ps <4 x float> 
@image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node ; GFX1030-NEXT: v_and_or_b32 v25, 0xffff, v6, v0 ; GFX1030-NEXT: v_and_or_b32 v26, 0xffff, v7, v1 ; GFX1030-NEXT: v_alignbit_b32 v27, v2, v8, 16 -; GFX1030-NEXT: s_mov_b32 s1, exec_lo ; GFX1030-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GFX1030-NEXT: v_readfirstlane_b32 s4, v17 ; GFX1030-NEXT: v_readfirstlane_b32 s5, v18 @@ -528,6 +521,7 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node ; GFX1030-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1030-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[19:27], s[4:7] a16 +; GFX1030-NEXT: s_xor_b32 s1, exec_lo, s0 ; GFX1030-NEXT: ; implicit-def: $vgpr17 ; GFX1030-NEXT: ; implicit-def: $vgpr19 ; GFX1030-NEXT: ; implicit-def: $vgpr20 @@ -539,10 +533,9 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node ; GFX1030-NEXT: ; implicit-def: $vgpr26 ; GFX1030-NEXT: ; implicit-def: $vgpr27 ; GFX1030-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13 -; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1030-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1030-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1030-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1030-NEXT: ; %bb.2: -; GFX1030-NEXT: s_mov_b32 exec_lo, s1 ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: ; return to shader part epilog ; @@ -553,7 +546,6 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node ; GFX1013-NEXT: v_lshrrev_b32_e32 v10, 16, v6 ; GFX1013-NEXT: v_and_b32_e32 v11, 0xffff, v8 ; GFX1013-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX1013-NEXT: s_mov_b32 s1, exec_lo ; GFX1013-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; GFX1013-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; GFX1013-NEXT: v_alignbit_b32 v8, v9, v8, 16 @@ -569,14 +561,14 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node ; GFX1013-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1013-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1013-NEXT: 
image_bvh64_intersect_ray v[14:17], v[0:8], s[4:7] a16 +; GFX1013-NEXT: s_xor_b32 s1, exec_lo, s0 ; GFX1013-NEXT: ; implicit-def: $vgpr18 ; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 ; GFX1013-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13 ; GFX1013-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1013-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1013-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1013-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1013-NEXT: ; %bb.2: -; GFX1013-NEXT: s_mov_b32 exec_lo, s1 ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: v_mov_b32_e32 v0, v14 ; GFX1013-NEXT: v_mov_b32_e32 v1, v15 @@ -595,29 +587,29 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node ; GFX11-NEXT: v_lshl_or_b32 v20, v6, 16, v0 ; GFX11-NEXT: v_perm_b32 v21, v6, v8, 0x7060302 ; GFX11-NEXT: v_lshl_or_b32 v22, v7, 16, v1 -; GFX11-NEXT: s_mov_b32 s1, exec_lo ; GFX11-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_readfirstlane_b32 s4, v4 ; GFX11-NEXT: v_readfirstlane_b32 s5, v5 ; GFX11-NEXT: v_readfirstlane_b32 s6, v12 ; GFX11-NEXT: v_readfirstlane_b32 s7, v13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[17:18], v19, v[14:16], v[20:22]], s[4:7] a16 +; GFX11-NEXT: s_xor_b32 s1, exec_lo, s0 ; GFX11-NEXT: ; implicit-def: $vgpr4 ; GFX11-NEXT: ; 
implicit-def: $vgpr17_vgpr18 ; GFX11-NEXT: ; implicit-def: $vgpr19 ; GFX11-NEXT: ; implicit-def: $vgpr14_vgpr15_vgpr16 ; GFX11-NEXT: ; implicit-def: $vgpr20_vgpr21_vgpr22 ; GFX11-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll index 4a151aeca87e43..cc0084d84053a3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll @@ -187,7 +187,6 @@ define amdgpu_ps float @general_case_load_with_waterfall(ptr %p, i16 %stride, i3 ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[V_AND_OR_B32_e64_]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -210,14 +209,12 @@ define amdgpu_ps float @general_case_load_with_waterfall(ptr %p, i16 %stride, i3 ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[COPY7]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) 
from %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll index 570a39d0fa5fb4..dcbcea930da1d3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll @@ -179,7 +179,6 @@ define amdgpu_ps float @raw_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_vof ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -205,14 +204,12 @@ define amdgpu_ps float @raw_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_vof ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_ATOMIC_ADD_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_OFFEN_RTN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], 
implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_OFFEN_RTN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -232,7 +229,6 @@ define amdgpu_ps float @raw_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_vof ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -258,14 +254,12 @@ define amdgpu_ps float @raw_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_vof ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[BUFFER_ATOMIC_ADD_VBUFFER_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_VBUFFER_OFFEN_RTN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = 
S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_VBUFFER_OFFEN_RTN]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -291,7 +285,6 @@ define amdgpu_ps void @raw_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgp ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -317,14 +310,12 @@ define amdgpu_ps void @raw_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgp ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: BUFFER_ATOMIC_ADD_OFFEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: S_ENDPGM 0 ; @@ -343,7 +334,6 @@ define amdgpu_ps void @raw_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgp ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: 
%bb.3(0x80000000) @@ -369,14 +359,12 @@ define amdgpu_ps void @raw_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgp ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: BUFFER_ATOMIC_ADD_VBUFFER_OFFEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll index c2799e5836a974..024607ffac9df6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll @@ -105,7 +105,6 @@ define amdgpu_ps float @raw_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vgpr_ ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -133,14 +132,12 @@ define amdgpu_ps float 
@raw_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vgpr_ ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1 ; GFX8-NEXT: [[BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN [[REG_SEQUENCE2]], [[COPY10]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX8-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN]].sub0 - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[COPY15]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -162,7 +159,6 @@ define amdgpu_ps float @raw_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vgpr_ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -190,14 +186,12 @@ define amdgpu_ps float @raw_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vgpr_ ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1 ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN [[REG_SEQUENCE2]], [[COPY10]], [[REG_SEQUENCE1]], 
[[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN]].sub0 - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[COPY15]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -225,7 +219,6 @@ define amdgpu_ps void @raw_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cmp__ ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -252,14 +245,12 @@ define amdgpu_ps void @raw_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cmp__ ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1 ; GFX8-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE2]], [[COPY10]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, 
[[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: S_ENDPGM 0 ; @@ -280,7 +271,6 @@ define amdgpu_ps void @raw_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cmp__ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -307,14 +297,12 @@ define amdgpu_ps void @raw_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cmp__ ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1 ; GFX12-NEXT: BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN [[REG_SEQUENCE2]], [[COPY10]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ 
-497,7 +485,6 @@ define amdgpu_ps double @raw_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] ; GFX8-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; GFX8-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -525,14 +512,12 @@ define amdgpu_ps double @raw_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr ; GFX8-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0_sub1, [[COPY11]], %subreg.sub2_sub3 ; GFX8-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN [[REG_SEQUENCE4]], [[COPY12]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) ; GFX8-NEXT: [[COPY17:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN]].sub0_sub1 - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub0 ; GFX8-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub1 @@ -563,7 +548,6 @@ define amdgpu_ps double @raw_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY 
[[COPY8]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -591,14 +575,12 @@ define amdgpu_ps double @raw_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr ; GFX12-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0_sub1, [[COPY11]], %subreg.sub2_sub3 ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN [[REG_SEQUENCE4]], [[COPY12]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) ; GFX12-NEXT: [[COPY17:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN]].sub0_sub1 - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub0 ; GFX12-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub1 @@ -635,7 +617,6 @@ define amdgpu_ps void @raw_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cmp__ ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] ; GFX8-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; GFX8-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -662,14 +643,12 @@ define amdgpu_ps void 
@raw_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cmp__ ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0_sub1, [[COPY11]], %subreg.sub2_sub3 ; GFX8-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_OFFEN [[REG_SEQUENCE4]], [[COPY12]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: S_ENDPGM 0 ; @@ -694,7 +673,6 @@ define amdgpu_ps void @raw_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cmp__ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -721,14 +699,12 @@ define amdgpu_ps void @raw_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cmp__ ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0_sub1, [[COPY11]], %subreg.sub2_sub3 ; GFX12-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN [[REG_SEQUENCE4]], [[COPY12]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], 
implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 %ret = call i64 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll index c96fc017ae936a..7209795fa5fb47 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll @@ -154,7 +154,6 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgp ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.3(0x80000000) @@ -180,14 +179,12 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgp ; GFX908-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.2, implicit 
$exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.4: ; GFX908-NEXT: successors: %bb.5(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.5: ; GFX908-NEXT: S_ENDPGM 0 ; @@ -206,7 +203,6 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgp ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX90A-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2: ; GFX90A-NEXT: successors: %bb.3(0x80000000) @@ -232,14 +228,12 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgp ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX90A-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX90A-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX90A-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX90A-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4: ; GFX90A-NEXT: successors: %bb.5(0x80000000) ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.5: ; GFX90A-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 
%voffset, i32 %soffset, i32 0) @@ -261,7 +255,6 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_v ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.3(0x80000000) @@ -287,14 +280,12 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_v ; GFX908-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.4: ; GFX908-NEXT: successors: %bb.5(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.5: ; GFX908-NEXT: S_ENDPGM 0 ; @@ -311,7 +302,6 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_v ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; GFX90A-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = 
S_MOV_B64 $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2: ; GFX90A-NEXT: successors: %bb.3(0x80000000) @@ -337,14 +327,12 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_v ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX90A-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX90A-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX90A-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX90A-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4: ; GFX90A-NEXT: successors: %bb.5(0x80000000) ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.5: ; GFX90A-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll index 36d5e914d40bed..a739fbc002a2b1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll @@ -210,7 +210,6 @@ define amdgpu_ps half @raw_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__vgpr ; PACKED-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: 
successors: %bb.3(0x80000000) @@ -236,14 +235,12 @@ define amdgpu_ps half @raw_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__vgpr ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; PACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -261,7 +258,6 @@ define amdgpu_ps half @raw_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__vgpr ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -287,14 +283,12 @@ define amdgpu_ps half @raw_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__vgpr ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: 
(dereferenceable load (s16), align 1, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN]] ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -312,7 +306,6 @@ define amdgpu_ps half @raw_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__vgpr ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -338,14 +331,12 @@ define amdgpu_ps half @raw_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__vgpr ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[BUFFER_LOAD_FORMAT_D16_X_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_VBUFFER_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], 
[[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_VBUFFER_OFFEN]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll index 23efaa4d2bd91e..d010b948b48b73 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll @@ -184,7 +184,6 @@ define amdgpu_ps float @raw_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset__vgp ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -210,14 +209,12 @@ define amdgpu_ps float @raw_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset__vgp ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: 
%bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -235,7 +232,6 @@ define amdgpu_ps float @raw_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset__vgp ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -261,14 +257,12 @@ define amdgpu_ps float @raw_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset__vgp ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[BUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll index 102a9bd840b091..f5a61a67055418 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll @@ -89,7 +89,6 @@ define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffs ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -112,14 +111,12 @@ define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffs ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -136,7 +133,6 @@ define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffs ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX12-NEXT: 
[[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -159,14 +155,12 @@ define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffs ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -188,7 +182,6 @@ define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -214,14 +207,12 @@ define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; GFX8-NEXT: 
successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -238,7 +229,6 @@ define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -264,14 +254,12 @@ define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def 
$scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -894,7 +882,6 @@ define amdgpu_ps half @raw_buffer_load_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffse ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -917,14 +904,12 @@ define amdgpu_ps half @raw_buffer_load_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffse ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; 
GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -941,7 +926,6 @@ define amdgpu_ps half @raw_buffer_load_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffse ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -964,14 +948,12 @@ define amdgpu_ps half @raw_buffer_load_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffse ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[BUFFER_LOAD_USHORT_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_VBUFFER_OFFEN]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -993,7 +975,6 @@ define amdgpu_ps float @raw_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffse ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = 
REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -1016,14 +997,12 @@ define amdgpu_ps float @raw_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffse ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -1040,7 +1019,6 @@ define amdgpu_ps float @raw_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffse ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -1063,14 +1041,12 @@ define amdgpu_ps float 
@raw_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffse ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[BUFFER_LOAD_UBYTE_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_VBUFFER_OFFEN]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -1487,7 +1463,6 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000 ; GFX8-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def dead $scc - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -1510,14 +1485,12 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_ADD_I32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], 
implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -1536,7 +1509,6 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000 ; GFX12-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def dead $scc - ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -1559,14 +1531,12 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_ADD_I32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; 
GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_1]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -1592,7 +1562,6 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -1615,14 +1584,12 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE1]], [[COPY5]], 904, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -1639,7 +1606,6 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], 
%subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -1662,14 +1628,12 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 5000, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll index 6541085b72e549..76f46e073c4eae 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll @@ -233,7 +233,6 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED-NEXT: 
[[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY5]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3 - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -256,14 +255,12 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE2]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; @@ -282,7 +279,6 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: 
; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -305,14 +301,12 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE2]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 ; @@ -331,7 +325,6 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -354,14 +347,12 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_VBUFFER_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE2]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, 
[[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -668,7 +659,6 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY10]], [[COPY5]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3 - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -691,14 +681,12 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE2]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, 
[[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; @@ -720,7 +708,6 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; PACKED-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -743,14 +730,12 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE2]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 ; @@ -769,7 +754,6 @@ define amdgpu_ps void 
@raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -792,14 +776,12 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_VBUFFER_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE2]], [[COPY7]], 4096, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll index 1f89150f09ced8..df8cfe8e396a42 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll @@ -207,7 +207,6 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; GFX8-NEXT: 
[[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr8 ; GFX8-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -230,14 +229,12 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: S_ENDPGM 0 ; @@ -258,7 +255,6 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr8 ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -281,14 +277,12 @@ define amdgpu_ps void 
@raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: BUFFER_STORE_FORMAT_XYZW_VBUFFER_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -514,7 +508,6 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY8]], [[COPY10]], 0, implicit $exec - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -537,14 +530,12 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE2]], [[COPY9]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, 
addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: S_ENDPGM 0 ; @@ -565,7 +556,6 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr8 ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -588,14 +578,12 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: BUFFER_STORE_FORMAT_XYZW_VBUFFER_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 4096, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; 
GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll index 030f8dae0ef790..12a2741ecc961d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll @@ -92,7 +92,6 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -115,14 +114,12 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: S_ENDPGM 0 ; @@ -139,7 +136,6 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; GFX12-NEXT: 
[[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -162,14 +158,12 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -191,7 +185,6 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -205,14 +198,12 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} 
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: S_ENDPGM 0 ; @@ -229,7 +220,6 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -243,14 +233,12 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; 
GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -272,7 +260,6 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -298,14 +285,12 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: S_ENDPGM 0 ; @@ -322,7 +307,6 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 - ; GFX12-NEXT: 
[[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -348,14 +332,12 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -912,7 +894,6 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 ; GFX8-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -935,14 +916,12 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], 
[[REG_SEQUENCE2]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: S_ENDPGM 0 ; @@ -961,7 +940,6 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -984,14 +962,12 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: BUFFER_STORE_DWORDX2_VBUFFER_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE2]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; 
GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -1366,7 +1342,6 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -1389,14 +1364,12 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE1]], [[COPY6]], 904, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: S_ENDPGM 0 ; @@ -1413,7 +1386,6 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = 
COPY $vgpr5 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -1436,14 +1408,12 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 5000, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 %voffset.add = add i32 %voffset, 5000 @@ -1467,7 +1437,6 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -1490,14 +1459,12 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY5]], 904, 0, 0, implicit $exec :: 
(dereferenceable store (s32), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: S_ENDPGM 0 ; @@ -1513,7 +1480,6 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -1536,14 +1502,12 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 5000, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: 
%bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 5000, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.add.ll index 93d68443c78431..ee424f5ce36b04 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.add.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.add.ll @@ -108,7 +108,6 @@ define amdgpu_ps float @raw_ptr_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -134,14 +133,12 @@ define amdgpu_ps float @raw_ptr_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[BUFFER_ATOMIC_ADD_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_OFFEN_RTN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; 
CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_OFFEN_RTN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -167,7 +164,6 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc_ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -193,14 +189,12 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc_ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: BUFFER_ATOMIC_ADD_OFFEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 %ret = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add.i32(i32 %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.cmpswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.cmpswap.ll index 56b2d0452dd45f..47a8507ba3f88e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.cmpswap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.cmpswap.ll @@ -67,7 +67,6 @@ define amdgpu_ps float @raw_ptr_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__v ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -95,14 +94,12 @@ define amdgpu_ps float @raw_ptr_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__v ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1 ; CHECK-NEXT: [[BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN [[REG_SEQUENCE2]], [[COPY10]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN]].sub0 - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY 
[[COPY15]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -130,7 +127,6 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_c ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -157,14 +153,12 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_c ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1 ; CHECK-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE2]], [[COPY10]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 %ret = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -279,7 +273,6 @@ define amdgpu_ps double @raw_ptr_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__ ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] - ; CHECK-NEXT: 
[[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -307,14 +300,12 @@ define amdgpu_ps double @raw_ptr_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__ ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0_sub1, [[COPY11]], %subreg.sub2_sub3 ; CHECK-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN [[REG_SEQUENCE4]], [[COPY12]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[COPY17:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN]].sub0_sub1 - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub0 ; CHECK-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub1 @@ -351,7 +342,6 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_c ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -378,14 +368,12 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_c 
; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0_sub1, [[COPY11]], %subreg.sub2_sub3 ; CHECK-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_OFFEN [[REG_SEQUENCE4]], [[COPY12]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 %ret = call i64 @llvm.amdgcn.raw.ptr.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.ll index 999f42ff905ab7..d62d91fb0b914b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.ll @@ -154,7 +154,6 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc_ ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: ; 
GFX908-NEXT: successors: %bb.3(0x80000000) @@ -180,14 +179,12 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc_ ; GFX908-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.4: ; GFX908-NEXT: successors: %bb.5(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.5: ; GFX908-NEXT: S_ENDPGM 0 ; @@ -206,7 +203,6 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc_ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX90A-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2: ; GFX90A-NEXT: successors: %bb.3(0x80000000) @@ -232,14 +228,12 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc_ ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, 
addrspace 8) - ; GFX90A-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX90A-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX90A-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX90A-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4: ; GFX90A-NEXT: successors: %bb.5(0x80000000) ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.5: ; GFX90A-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -261,7 +255,6 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc_ ; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.3(0x80000000) @@ -287,14 +280,12 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc_ ; GFX908-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP 
[[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.4: ; GFX908-NEXT: successors: %bb.5(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.5: ; GFX908-NEXT: S_ENDPGM 0 ; @@ -311,7 +302,6 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc_ ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; GFX90A-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2: ; GFX90A-NEXT: successors: %bb.3(0x80000000) @@ -337,14 +327,12 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc_ ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX90A-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX90A-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX90A-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4: ; GFX90A-NEXT: successors: %bb.5(0x80000000) ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.5: ; GFX90A-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0) diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.f16.ll index 5b19b1c913a94b..d7ec30a423fdeb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.f16.ll @@ -161,7 +161,6 @@ define amdgpu_ps half @raw_ptr_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__ ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -187,14 +186,12 @@ define amdgpu_ps half @raw_ptr_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__ ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: $vgpr0 = COPY 
[[BUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; PACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -212,7 +209,6 @@ define amdgpu_ps half @raw_ptr_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__ ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -238,14 +234,12 @@ define amdgpu_ps half @raw_ptr_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__ ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN]] ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.ll index 
2dc688db86e4fb..bbfe08bdd41081 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.ll @@ -108,7 +108,6 @@ define amdgpu_ps float @raw_ptr_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset_ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -134,14 +133,12 @@ define amdgpu_ps float @raw_ptr_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset_ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.ll 
index 7b8b028128dd3d..a87668c897a58a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.ll @@ -57,7 +57,6 @@ define amdgpu_ps float @raw_ptr_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_s ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -80,14 +79,12 @@ define amdgpu_ps float @raw_ptr_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_s ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -109,7 +106,6 @@ define amdgpu_ps float @raw_ptr_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_s ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = 
COPY $vgpr5 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -135,14 +131,12 @@ define amdgpu_ps float @raw_ptr_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_s ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -526,7 +520,6 @@ define amdgpu_ps half @raw_ptr_buffer_load_f16__vgpr_rsrc__vgpr_voffset__sgpr_so ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -549,14 +542,12 @@ define amdgpu_ps half 
@raw_ptr_buffer_load_f16__vgpr_rsrc__vgpr_voffset__sgpr_so ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -578,7 +569,6 @@ define amdgpu_ps float @raw_ptr_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_so ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -601,14 +591,12 @@ define amdgpu_ps float @raw_ptr_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_so ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8) from %ir.rsrc, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term 
$exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -857,7 +845,6 @@ define amdgpu_ps float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_s ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000 ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def dead $scc ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -880,14 +867,12 @@ define amdgpu_ps float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_s ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_ADD_I32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit 
$scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -913,7 +898,6 @@ define amdgpu_ps float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_s ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -936,14 +920,12 @@ define amdgpu_ps float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_s ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE1]], [[COPY5]], 904, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.format.f16.ll index 3ed6bbdd361569..fe734b326ee8c4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.format.f16.ll @@ -171,7 +171,6 @@ define amdgpu_ps void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_vo ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY5]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3 ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -194,14 +193,12 @@ define amdgpu_ps void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_vo ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE2]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>) into %ir.rsrc, align 1, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: 
successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; @@ -220,7 +217,6 @@ define amdgpu_ps void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_vo ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -243,14 +239,12 @@ define amdgpu_ps void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_vo ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE2]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>) into %ir.rsrc, align 1, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.ptr.buffer.store.format.v4f16(<4 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -482,7 +476,6 @@ define amdgpu_ps void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_vo ; UNPACKED-NEXT: 
[[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY10]], [[COPY5]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3 ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -505,14 +498,12 @@ define amdgpu_ps void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_vo ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE2]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>) into %ir.rsrc, align 1, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; @@ -534,7 +525,6 @@ define amdgpu_ps void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_vo ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; PACKED-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY6]], [[COPY8]], 0, implicit 
$exec ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -557,14 +547,12 @@ define amdgpu_ps void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_vo ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE2]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>) into %ir.rsrc, align 1, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.format.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.format.f32.ll index dee83a9b0a6ece..32d82f4d71a87c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.format.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.format.f32.ll @@ -123,7 +123,6 @@ define amdgpu_ps void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_vo ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr8 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY 
$sgpr2 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -146,14 +145,12 @@ define amdgpu_ps void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_vo ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>) into %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.ptr.buffer.store.format.v4f32(<4 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -294,7 +291,6 @@ define amdgpu_ps void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_vo ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY8]], [[COPY10]], 0, implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 
$exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -317,14 +313,12 @@ define amdgpu_ps void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_vo ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE2]], [[COPY9]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>) into %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.ll index 2c99ce8694bcc1..141d2a92f3952b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.ll @@ -59,7 +59,6 @@ define amdgpu_ps void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: 
%bb.3(0x80000000) @@ -82,14 +81,12 @@ define amdgpu_ps void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -111,7 +108,6 @@ define amdgpu_ps void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -125,14 +121,12 @@ define amdgpu_ps void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into 
%ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -154,7 +148,6 @@ define amdgpu_ps void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -180,14 +173,12 @@ define amdgpu_ps void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], 
[[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -527,7 +518,6 @@ define amdgpu_ps void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -550,14 +540,12 @@ define amdgpu_ps void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE2]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>) into %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.ptr.buffer.store.v4f16(<4 x half> %val, ptr addrspace(8) %rsrc, 
i32 %voffset, i32 %soffset, i32 0) @@ -784,7 +772,6 @@ define amdgpu_ps void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -807,14 +794,12 @@ define amdgpu_ps void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE1]], [[COPY6]], 904, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 %voffset.add = add i32 %voffset, 5000 @@ -838,7 +823,6 @@ define amdgpu_ps void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK-NEXT: 
[[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -861,14 +845,12 @@ define amdgpu_ps void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY5]], 904, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 5000, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.f16.ll index a799e203d6439d..4e3872b36271ca 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.f16.ll @@ -159,7 +159,6 @@ define amdgpu_ps half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; 
UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -185,14 +184,12 @@ define amdgpu_ps half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN]] ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -210,7 +207,6 @@ define amdgpu_ps half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -236,14 +232,12 @@ define amdgpu_ps half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; 
PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; PACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.ll index 3e135472ebbb18..3077c126713cae 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.ll @@ -107,7 +107,6 @@ define amdgpu_ps float @raw_tbuffer_load_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -133,14 +132,12 @@ define amdgpu_ps float 
@raw_tbuffer_load_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.f16.ll index 725faa1b4a49f0..c3a21db93051cf 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.f16.ll @@ -138,7 +138,6 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soff ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: 
%bb.3(0x80000000) @@ -161,14 +160,12 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soff ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; @@ -185,7 +182,6 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soff ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -208,14 +204,12 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soff ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) - ; PACKED-NEXT: 
$exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.ptr.tbuffer.store.f16(half %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -237,7 +231,6 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soff ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -263,14 +256,12 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soff ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: 
SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; @@ -287,7 +278,6 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soff ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -313,14 +303,12 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soff ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.ptr.tbuffer.store.f16(half %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, 
i32 78, i32 0) @@ -343,7 +331,6 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soff ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -369,14 +356,12 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soff ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; @@ -394,7 +379,6 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soff ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - 
; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -420,14 +404,12 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soff ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.ptr.tbuffer.store.f16(half %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.i8.ll index a12a6005df24ea..2f537de57a6a7f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.i8.ll @@ -51,7 +51,6 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffs ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], 
%subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -74,14 +73,12 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffs ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8) into %ir.rsrc, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; @@ -98,7 +95,6 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffs ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -121,14 +117,12 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffs ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: 
TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8) into %ir.rsrc, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.ptr.tbuffer.store.i8(i8 %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -150,7 +144,6 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -176,14 +169,12 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8) into %ir.rsrc, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit 
$exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; @@ -200,7 +191,6 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -226,14 +216,12 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8) into %ir.rsrc, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: 
; PACKED-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.ptr.tbuffer.store.i8(i8 %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -256,7 +244,6 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -282,14 +269,12 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8) into %ir.rsrc, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; @@ -307,7 +292,6 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], 
%subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -333,14 +317,12 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8) into %ir.rsrc, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.ptr.tbuffer.store.i8(i8 %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.ll index 9db5c160a62363..5250e70469f4a6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.ll @@ -127,7 +127,6 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__sgpr_soff ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; CHECK-NEXT: 
[[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -150,14 +149,12 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__sgpr_soff ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 94, 1, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 1) @@ -179,7 +176,6 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__vgpr_soff ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -205,14 +201,12 @@ define 
amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__vgpr_soff ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -235,7 +229,6 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -261,14 +254,12 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable 
store (s32) into %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) @@ -597,7 +588,6 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000 ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY6]], [[S_MOV_B32_]], implicit-def dead $scc ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -620,14 +610,12 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[S_ADD_I32_]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = 
S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_1]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 %soffset = add i32 %soffset.base, 5000 @@ -653,7 +641,6 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -676,14 +663,12 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE1]], [[COPY6]], 904, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_1]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 %voffset = add i32 %voffset.base, 5000 diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll index 1cfb15391be367..6fe4060bbf4999 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll @@ -208,7 +208,6 @@ define amdgpu_ps half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -234,14 +233,12 @@ define amdgpu_ps half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN]] ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -259,7 +256,6 @@ define 
amdgpu_ps half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; PACKED-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -285,14 +281,12 @@ define amdgpu_ps half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; PACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -310,7 +304,6 @@ define amdgpu_ps half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -336,14 +329,12 @@ define amdgpu_ps 
half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_VBUFFER_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_VBUFFER_OFFEN]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll index 4f8b20d10c8749..6ea107fe89141b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll @@ -183,7 +183,6 @@ define amdgpu_ps float @raw_tbuffer_load_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; GFX10_GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX10_GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX10_GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; GFX10_GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: bb.2: ; GFX10_GFX11-NEXT: successors: %bb.3(0x80000000) @@ -209,14 +208,12 @@ define amdgpu_ps float 
@raw_tbuffer_load_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; GFX10_GFX11-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX10_GFX11-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX10_GFX11-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX10_GFX11-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX10_GFX11-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: bb.4: ; GFX10_GFX11-NEXT: successors: %bb.5(0x80000000) ; GFX10_GFX11-NEXT: {{ $}} - ; GFX10_GFX11-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: bb.5: ; GFX10_GFX11-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; GFX10_GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -234,7 +231,6 @@ define amdgpu_ps float @raw_tbuffer_load_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -260,14 +256,12 @@ define amdgpu_ps float @raw_tbuffer_load_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable 
load (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll index b9d0cb52d24054..5610d172929b57 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll @@ -186,7 +186,6 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soff ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -209,14 +208,12 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soff ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 94, 0, 0, implicit $exec :: 
(dereferenceable store (s16), align 1, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; @@ -233,7 +230,6 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soff ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -256,14 +252,12 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soff ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; PACKED-NEXT: {{ $}} ; 
PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 ; @@ -280,7 +274,6 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -303,14 +296,12 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: TBUFFER_STORE_FORMAT_D16_X_VBUFFER_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -332,7 +323,6 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soff ; UNPACKED-NEXT: 
[[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -358,14 +348,12 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soff ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; @@ -382,7 +370,6 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soff ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: 
%bb.3(0x80000000) @@ -408,14 +395,12 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soff ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 ; @@ -432,7 +417,6 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soff ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -458,14 +442,12 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soff ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: TBUFFER_STORE_FORMAT_D16_X_VBUFFER_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term 
$exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) @@ -488,7 +470,6 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soff ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -514,14 +495,12 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soff ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: 
%bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; @@ -539,7 +518,6 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soff ; PACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -565,14 +543,12 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soff ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 ; @@ -590,7 +566,6 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soff ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; 
GFX12-NEXT: successors: %bb.3(0x80000000) @@ -616,14 +591,12 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soff ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: TBUFFER_STORE_FORMAT_D16_X_VBUFFER_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll index c1fb4aacafe1d4..e9027db29a0d1d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll @@ -67,7 +67,6 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffs ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ 
$}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -90,14 +89,12 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffs ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; @@ -114,7 +111,6 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffs ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -137,14 +133,12 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffs ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 8) - ; PACKED-NEXT: $exec = 
S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 ; @@ -161,7 +155,6 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffs ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -184,14 +177,12 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffs ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: 
$exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.i8(i8 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -213,7 +204,6 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -239,14 +229,12 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; @@ -263,7 +251,6 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], 
%subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -289,14 +276,12 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 ; @@ -313,7 +298,6 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -339,14 +323,12 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; GFX12-NEXT: successors: 
%bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.i8(i8 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -369,7 +351,6 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -395,14 +376,12 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: 
[[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; @@ -420,7 +399,6 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; PACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -446,14 +424,12 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 ; @@ -471,7 +447,6 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; GFX12-NEXT: 
[[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -497,14 +472,12 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.i8(i8 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll index 09227af922a6e9..8a688453ba048c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll @@ -213,7 +213,6 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX10_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, 
[[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX10_GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; GFX10_GFX11-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX10_GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: bb.2: ; GFX10_GFX11-NEXT: successors: %bb.3(0x80000000) @@ -236,14 +235,12 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX10_GFX11-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 94, 1, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX10_GFX11-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX10_GFX11-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX10_GFX11-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX10_GFX11-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: bb.4: ; GFX10_GFX11-NEXT: successors: %bb.5(0x80000000) ; GFX10_GFX11-NEXT: {{ $}} - ; GFX10_GFX11-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: bb.5: ; GFX10_GFX11-NEXT: S_ENDPGM 0 ; @@ -260,7 +257,6 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -283,14 +279,12 @@ define 
amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 94, 1, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 1) @@ -312,7 +306,6 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__vgpr_soff ; GFX10_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX10_GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; GFX10_GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 - ; GFX10_GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: bb.2: ; GFX10_GFX11-NEXT: successors: %bb.3(0x80000000) @@ -338,14 +331,12 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__vgpr_soff ; GFX10_GFX11-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit 
$exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX10_GFX11-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX10_GFX11-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX10_GFX11-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX10_GFX11-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: bb.4: ; GFX10_GFX11-NEXT: successors: %bb.5(0x80000000) ; GFX10_GFX11-NEXT: {{ $}} - ; GFX10_GFX11-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: bb.5: ; GFX10_GFX11-NEXT: S_ENDPGM 0 ; @@ -362,7 +353,6 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__vgpr_soff ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -388,14 +378,12 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__vgpr_soff ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP 
[[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -418,7 +406,6 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; GFX10_GFX11-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX10_GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; GFX10_GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX10_GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: bb.2: ; GFX10_GFX11-NEXT: successors: %bb.3(0x80000000) @@ -444,14 +431,12 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; GFX10_GFX11-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX10_GFX11-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX10_GFX11-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX10_GFX11-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX10_GFX11-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: bb.4: ; GFX10_GFX11-NEXT: successors: %bb.5(0x80000000) ; GFX10_GFX11-NEXT: {{ $}} - ; GFX10_GFX11-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: bb.5: ; GFX10_GFX11-NEXT: S_ENDPGM 0 ; @@ -469,7 +454,6 @@ 
define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -495,14 +479,12 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) @@ -1059,7 +1041,6 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX10_GFX11-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX10_GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000 ; GFX10_GFX11-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY6]], [[S_MOV_B32_]], implicit-def dead $scc - ; GFX10_GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX10_GFX11-NEXT: {{ $}} ; 
GFX10_GFX11-NEXT: bb.2: ; GFX10_GFX11-NEXT: successors: %bb.3(0x80000000) @@ -1082,14 +1063,12 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX10_GFX11-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[S_ADD_I32_]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX10_GFX11-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX10_GFX11-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX10_GFX11-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX10_GFX11-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: bb.4: ; GFX10_GFX11-NEXT: successors: %bb.5(0x80000000) ; GFX10_GFX11-NEXT: {{ $}} - ; GFX10_GFX11-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_1]] - ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: bb.5: ; GFX10_GFX11-NEXT: S_ENDPGM 0 ; @@ -1108,7 +1087,6 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000 ; GFX12-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY6]], [[S_MOV_B32_]], implicit-def dead $scc - ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -1131,14 +1109,12 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[S_ADD_I32_]], 0, 94, 0, 0, implicit $exec :: (dereferenceable 
store (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_1]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 %soffset = add i32 %soffset.base, 5000 @@ -1164,7 +1140,6 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX10_GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; GFX10_GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GFX10_GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; GFX10_GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: bb.2: ; GFX10_GFX11-NEXT: successors: %bb.3(0x80000000) @@ -1187,14 +1162,12 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX10_GFX11-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE1]], [[COPY6]], 904, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX10_GFX11-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX10_GFX11-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX10_GFX11-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX10_GFX11-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], 
[[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: bb.4: ; GFX10_GFX11-NEXT: successors: %bb.5(0x80000000) ; GFX10_GFX11-NEXT: {{ $}} - ; GFX10_GFX11-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_1]] - ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: bb.5: ; GFX10_GFX11-NEXT: S_ENDPGM 0 ; @@ -1211,7 +1184,6 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -1234,14 +1206,12 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 5000, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 %voffset = add i32 %voffset.base, 5000 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll index 91cde52cd2d67a..2882560d71eae2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll @@ -3620,7 +3620,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg % ; GFX6-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX6-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: ; GFX6-NEXT: successors: %bb.3(0x80000000) @@ -3643,14 +3642,12 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg % ; GFX6-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) - ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX6-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX6-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.4: ; GFX6-NEXT: successors: %bb.5(0x80000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.5: ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -3668,7 +3665,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg % ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; 
GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: ; GFX7-NEXT: successors: %bb.3(0x80000000) @@ -3691,14 +3687,12 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg % ; GFX7-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: ; GFX7-NEXT: successors: %bb.5(0x80000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -3716,7 +3710,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg % ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -3739,14 +3732,12 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg % ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) 
- ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -3763,7 +3754,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg % ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -3786,14 +3776,12 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg % ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY5]], [[REG_SEQUENCE1]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ 
$}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -3814,7 +3802,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> % ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: ; GFX6-NEXT: successors: %bb.3(0x80000000) @@ -3837,14 +3824,12 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> % ; GFX6-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4092, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) - ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX6-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX6-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.4: ; GFX6-NEXT: successors: %bb.5(0x80000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.5: ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -3860,7 +3845,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> % ; GFX7-NEXT: 
[[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: ; GFX7-NEXT: successors: %bb.3(0x80000000) @@ -3883,14 +3867,12 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> % ; GFX7-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4092, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: ; GFX7-NEXT: successors: %bb.5(0x80000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -3906,7 +3888,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> % ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -3929,14 +3910,12 @@ define 
amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> % ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4092, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -3952,7 +3931,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> % ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -3975,14 +3953,12 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> % ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4092, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, 
[[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -4008,7 +3984,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> % ; GFX6-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY4]], [[S_MOV_B32_]], implicit-def dead $scc ; GFX6-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: ; GFX6-NEXT: successors: %bb.3(0x80000000) @@ -4031,14 +4006,12 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> % ; GFX6-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) - ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX6-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX6-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.4: ; GFX6-NEXT: successors: %bb.5(0x80000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: 
$exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.5: ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -4058,7 +4031,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> % ; GFX7-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY4]], [[S_MOV_B32_]], implicit-def dead $scc ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: ; GFX7-NEXT: successors: %bb.3(0x80000000) @@ -4081,14 +4053,12 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> % ; GFX7-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: ; GFX7-NEXT: successors: %bb.5(0x80000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -4108,7 +4078,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> % ; GFX8-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY4]], [[S_MOV_B32_]], implicit-def dead $scc ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX8-NEXT: 
[[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -4131,14 +4100,12 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> % ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -4154,7 +4121,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> % ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -4177,14 +4143,12 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> % ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: 
[[BUFFER_LOAD_DWORD_VBUFFER_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4096, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -4206,7 +4170,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: ; GFX6-NEXT: successors: %bb.3(0x80000000) @@ -4229,14 +4192,12 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) ; GFX6-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4095, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from unknown-address + 4095, align 1) - ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX6-NEXT: 
[[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX6-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.4: ; GFX6-NEXT: successors: %bb.5(0x80000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.5: ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -4252,7 +4213,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: ; GFX7-NEXT: successors: %bb.3(0x80000000) @@ -4275,14 +4235,12 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) ; GFX7-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4095, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from unknown-address + 4095, align 1) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: ; GFX7-NEXT: successors: %bb.5(0x80000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX7-NEXT: {{ $}} ; 
GFX7-NEXT: bb.5: ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -4298,7 +4256,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -4321,14 +4278,12 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4095, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from unknown-address + 4095, align 1) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -4343,7 +4298,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = 
REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -4366,14 +4320,12 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET [[REG_SEQUENCE1]], $sgpr_null, 4095, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from unknown-address + 4095, align 1) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -4396,7 +4348,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: ; GFX6-NEXT: successors: %bb.3(0x80000000) @@ -4419,14 +4370,12 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) ; GFX6-NEXT: successors: %bb.4(0x40000000), 
%bb.2(0x40000000) ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) - ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX6-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX6-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.4: ; GFX6-NEXT: successors: %bb.5(0x80000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.5: ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -4444,7 +4393,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: ; GFX7-NEXT: successors: %bb.3(0x80000000) @@ -4467,14 +4415,12 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) ; GFX7-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: 
SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: ; GFX7-NEXT: successors: %bb.5(0x80000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -4490,7 +4436,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -4513,14 +4458,12 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4095, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from unknown-address + 4096) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -4535,7 +4478,6 
@@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -4558,14 +4500,12 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET [[REG_SEQUENCE1]], $sgpr_null, 4096, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from unknown-address + 4096) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -4587,7 +4527,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> % ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY 
$sgpr2 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: ; GFX6-NEXT: successors: %bb.3(0x80000000) @@ -4611,14 +4550,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> % ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX6-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX6-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.4: ; GFX6-NEXT: successors: %bb.5(0x80000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.5: ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFSET]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFSET1]], %subreg.sub4_sub5_sub6_sub7 ; GFX6-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -4650,7 +4587,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> % ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: ; GFX7-NEXT: 
successors: %bb.3(0x80000000) @@ -4674,14 +4610,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> % ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: ; GFX7-NEXT: successors: %bb.5(0x80000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFSET]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFSET1]], %subreg.sub4_sub5_sub6_sub7 ; GFX7-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -4713,7 +4647,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> % ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -4737,14 +4670,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> % ; 
GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFSET]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFSET1]], %subreg.sub4_sub5_sub6_sub7 ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -4776,7 +4707,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> % ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -4800,14 +4730,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> % ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET 
[[REG_SEQUENCE1]], [[COPY4]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET1]], %subreg.sub4_sub5_sub6_sub7 ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -4850,7 +4778,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> % ; GFX6-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY4]], [[S_MOV_B32_]], implicit-def dead $scc ; GFX6-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: ; GFX6-NEXT: successors: %bb.3(0x80000000) @@ -4874,14 +4801,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> % ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 
4) ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX6-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX6-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.4: ; GFX6-NEXT: successors: %bb.5(0x80000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.5: ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX6-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -4917,7 +4842,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> % ; GFX7-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY4]], [[S_MOV_B32_]], implicit-def dead $scc ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: ; GFX7-NEXT: successors: %bb.3(0x80000000) @@ -4941,14 +4865,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> % ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 16, 0, 0, 
implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: ; GFX7-NEXT: successors: %bb.5(0x80000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX7-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -4984,7 +4906,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> % ; GFX8-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY4]], [[S_MOV_B32_]], implicit-def dead $scc ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX8-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -5008,14 +4929,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> % ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc 
- ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -5047,7 +4966,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> % ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -5071,14 +4989,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> % ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4068, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4084, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: 
[[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET1]], %subreg.sub4_sub5_sub6_sub7 ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -5119,7 +5035,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> % ; GFX6-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY4]], [[S_MOV_B32_]], implicit-def dead $scc ; GFX6-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: ; GFX6-NEXT: successors: %bb.3(0x80000000) @@ -5143,14 +5058,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> % ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX6-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; 
GFX6-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.4: ; GFX6-NEXT: successors: %bb.5(0x80000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.5: ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX6-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -5186,7 +5099,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> % ; GFX7-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY4]], [[S_MOV_B32_]], implicit-def dead $scc ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: ; GFX7-NEXT: successors: %bb.3(0x80000000) @@ -5210,14 +5122,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> % ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: ; GFX7-NEXT: 
successors: %bb.5(0x80000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX7-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -5253,7 +5163,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> % ; GFX8-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY4]], [[S_MOV_B32_]], implicit-def dead $scc ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX8-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -5277,14 +5186,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> % ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; 
GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -5316,7 +5223,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> % ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -5340,14 +5246,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> % ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4096, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4112, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE 
[[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET1]], %subreg.sub4_sub5_sub6_sub7 ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -5385,7 +5289,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: ; GFX6-NEXT: successors: %bb.3(0x80000000) @@ -5409,14 +5312,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX6-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX6-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.4: ; GFX6-NEXT: successors: %bb.5(0x80000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.5: ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], 
%subreg.sub4_sub5_sub6_sub7 ; GFX6-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -5449,7 +5350,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: ; GFX7-NEXT: successors: %bb.3(0x80000000) @@ -5473,14 +5373,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: ; GFX7-NEXT: successors: %bb.5(0x80000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX7-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -5513,7 
+5411,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4064 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -5537,14 +5434,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 936, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 952, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -5576,7 +5471,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 ; GFX12-NEXT: 
[[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -5600,14 +5494,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], $sgpr_null, 5000, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], $sgpr_null, 5016, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -5645,7 +5537,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 ; GFX6-NEXT: 
[[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4076 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: ; GFX6-NEXT: successors: %bb.3(0x80000000) @@ -5669,14 +5560,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX6-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX6-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.4: ; GFX6-NEXT: successors: %bb.5(0x80000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.5: ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX6-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -5709,7 +5598,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], 
%subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4076 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: ; GFX7-NEXT: successors: %bb.3(0x80000000) @@ -5733,14 +5621,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: ; GFX7-NEXT: successors: %bb.5(0x80000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX7-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -5773,7 +5659,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX8-NEXT: 
[[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 12 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -5797,14 +5682,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -5836,7 +5719,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = 
S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -5860,14 +5742,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], $sgpr_null, 4076, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], $sgpr_null, 4092, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -5905,7 +5785,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4080 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; 
GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: ; GFX6-NEXT: successors: %bb.3(0x80000000) @@ -5929,14 +5808,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX6-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX6-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.4: ; GFX6-NEXT: successors: %bb.5(0x80000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.5: ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX6-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -5969,7 +5846,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4080 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: ; GFX7-NEXT: successors: %bb.3(0x80000000) @@ -5993,14 +5869,12 @@ define 
amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: ; GFX7-NEXT: successors: %bb.5(0x80000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX7-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -6033,7 +5907,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -6057,14 +5930,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: 
[[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -6096,7 +5967,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -6120,14 +5990,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN 
[[COPY4]], [[REG_SEQUENCE1]], $sgpr_null, 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], $sgpr_null, 4096, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -6164,7 +6032,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: ; GFX6-NEXT: successors: %bb.3(0x80000000) @@ -6188,14 +6055,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4 ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4064, 0, 0, 
implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) - ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX6-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX6-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.4: ; GFX6-NEXT: successors: %bb.5(0x80000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.5: ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFSET]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFSET1]], %subreg.sub4_sub5_sub6_sub7 ; GFX6-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -6227,7 +6092,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: ; GFX7-NEXT: successors: %bb.3(0x80000000) @@ -6251,14 +6115,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from 
unknown-address + 4064, align 4) ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: ; GFX7-NEXT: successors: %bb.5(0x80000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFSET]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFSET1]], %subreg.sub4_sub5_sub6_sub7 ; GFX7-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -6290,7 +6152,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -6314,14 +6175,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) ; GFX8-NEXT: 
[[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFSET]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFSET1]], %subreg.sub4_sub5_sub6_sub7 ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -6352,7 +6211,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -6376,14 +6234,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET [[REG_SEQUENCE1]], $sgpr_null, 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) ; GFX12-NEXT: 
[[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET [[REG_SEQUENCE1]], $sgpr_null, 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET1]], %subreg.sub4_sub5_sub6_sub7 ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll index ab720ce8f942c3..92058d35e5c34e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll @@ -197,7 +197,6 @@ define amdgpu_ps float @struct_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_ ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -224,14 +223,12 @@ define amdgpu_ps float @struct_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_ ; 
GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; GFX8-NEXT: [[BUFFER_ATOMIC_ADD_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_BOTHEN_RTN [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_BOTHEN_RTN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -253,7 +250,6 @@ define amdgpu_ps float @struct_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -280,14 +276,12 @@ define amdgpu_ps float @struct_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_ ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; GFX12-NEXT: [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, 
addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -315,7 +309,6 @@ define amdgpu_ps void @struct_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__ ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -342,14 +335,12 @@ define amdgpu_ps void @struct_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__ ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; GFX8-NEXT: BUFFER_ATOMIC_ADD_BOTHEN [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: 
successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: S_ENDPGM 0 ; @@ -370,7 +361,6 @@ define amdgpu_ps void @struct_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -397,14 +387,12 @@ define amdgpu_ps void @struct_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__ ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; GFX12-NEXT: BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 %ret = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll index 
f9f70ecadfe60f..bf5e6347be52a7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll @@ -114,7 +114,6 @@ define amdgpu_ps float @struct_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vg ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] ; GFX8-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] ; GFX8-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -143,14 +142,12 @@ define amdgpu_ps float @struct_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vg ; GFX8-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; GFX8-NEXT: [[BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX8-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN]].sub0 - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[COPY17]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -174,7 +171,6 @@ define amdgpu_ps float @struct_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vg ; GFX12-NEXT: 
[[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -203,14 +199,12 @@ define amdgpu_ps float @struct_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vg ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX12-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN]].sub0 - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[COPY17]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -240,7 +234,6 @@ define amdgpu_ps void @struct_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cm ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] ; GFX8-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] ; GFX8-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: 
successors: %bb.3(0x80000000) @@ -268,14 +261,12 @@ define amdgpu_ps void @struct_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cm ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY12]], %subreg.sub1 ; GFX8-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; GFX8-NEXT: BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: S_ENDPGM 0 ; @@ -298,7 +289,6 @@ define amdgpu_ps void @struct_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cm ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -326,14 +316,12 @@ define amdgpu_ps void @struct_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cm ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY12]], %subreg.sub1 ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; GFX12-NEXT: BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN [[REG_SEQUENCE3]], 
[[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 %ret = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -530,7 +518,6 @@ define amdgpu_ps double @struct_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__v ; GFX8-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; GFX8-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] ; GFX8-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -559,14 +546,12 @@ define amdgpu_ps double @struct_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__v ; GFX8-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0_sub1, [[COPY12]], %subreg.sub2_sub3 ; GFX8-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) ; GFX8-NEXT: [[COPY19:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN]].sub0_sub1 - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, 
[[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub0 ; GFX8-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub1 @@ -599,7 +584,6 @@ define amdgpu_ps double @struct_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__v ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -628,14 +612,12 @@ define amdgpu_ps double @struct_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__v ; GFX12-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0_sub1, [[COPY12]], %subreg.sub2_sub3 ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) ; GFX12-NEXT: [[COPY19:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN]].sub0_sub1 - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def 
$scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub0 ; GFX12-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub1 @@ -674,7 +656,6 @@ define amdgpu_ps void @struct_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cm ; GFX8-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; GFX8-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] ; GFX8-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -702,14 +683,12 @@ define amdgpu_ps void @struct_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cm ; GFX8-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY14]], %subreg.sub1 ; GFX8-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0_sub1, [[COPY12]], %subreg.sub2_sub3 ; GFX8-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; 
GFX8-NEXT: S_ENDPGM 0 ; @@ -736,7 +715,6 @@ define amdgpu_ps void @struct_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cm ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -764,14 +742,12 @@ define amdgpu_ps void @struct_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cm ; GFX12-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY14]], %subreg.sub1 ; GFX12-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0_sub1, [[COPY12]], %subreg.sub2_sub3 ; GFX12-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 %ret = call i64 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll index 
8589fe9fd056de..9b31ba8846d32b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll @@ -168,7 +168,6 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__ ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.3(0x80000000) @@ -195,14 +194,12 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__ ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.4: ; GFX908-NEXT: successors: %bb.5(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.5: ; GFX908-NEXT: S_ENDPGM 0 ; @@ -223,7 +220,6 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__ ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX90A-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX90A-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX90A-NEXT: 
[[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2: ; GFX90A-NEXT: successors: %bb.3(0x80000000) @@ -250,14 +246,12 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__ ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX90A-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX90A-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX90A-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX90A-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4: ; GFX90A-NEXT: successors: %bb.5(0x80000000) ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.5: ; GFX90A-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -281,7 +275,6 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__ ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.3(0x80000000) @@ -307,14 +300,12 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__ ; GFX908-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: 
BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.4: ; GFX908-NEXT: successors: %bb.5(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.5: ; GFX908-NEXT: S_ENDPGM 0 ; @@ -333,7 +324,6 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__ ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX90A-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2: ; GFX90A-NEXT: successors: %bb.3(0x80000000) @@ -359,14 +349,12 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__ ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX90A-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX90A-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX90A-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX90A-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; 
GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4: ; GFX90A-NEXT: successors: %bb.5(0x80000000) ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.5: ; GFX90A-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll index 870588014cd29c..8a5b0392c8c2c1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll @@ -229,7 +229,6 @@ define amdgpu_ps <4 x half> @struct_buffer_load_format_v4f16__vpr_rsrc__sgpr_vin ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -256,14 +255,12 @@ define amdgpu_ps <4 x half> @struct_buffer_load_format_v4f16__vpr_rsrc__sgpr_vin ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; UNPACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + 
; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub0 ; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub1 @@ -304,7 +301,6 @@ define amdgpu_ps <4 x half> @struct_buffer_load_format_v4f16__vpr_rsrc__sgpr_vin ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -331,14 +327,12 @@ define amdgpu_ps <4 x half> @struct_buffer_load_format_v4f16__vpr_rsrc__sgpr_vin ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; PACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; 
PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub0 ; PACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub1 @@ -361,7 +355,6 @@ define amdgpu_ps <4 x half> @struct_buffer_load_format_v4f16__vpr_rsrc__sgpr_vin ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -388,14 +381,12 @@ define amdgpu_ps <4 x half> @struct_buffer_load_format_v4f16__vpr_rsrc__sgpr_vin ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; GFX12-NEXT: [[BUFFER_LOAD_FORMAT_D16_XYZW_VBUFFER_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XYZW_VBUFFER_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_VBUFFER_BOTHEN]].sub0 ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_VBUFFER_BOTHEN]].sub1 diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll index 06bd45a45cceda..2e2c4c8aa508b3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll @@ -202,7 +202,6 @@ define amdgpu_ps <4 x float> @struct_buffer_load_format_v4f32__vpr_rsrc__sgpr_vi ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -229,14 +228,12 @@ define amdgpu_ps <4 x float> @struct_buffer_load_format_v4f32__vpr_rsrc__sgpr_vi ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; GFX8-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub0 ; GFX8-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY 
[[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub1 @@ -263,7 +260,6 @@ define amdgpu_ps <4 x float> @struct_buffer_load_format_v4f32__vpr_rsrc__sgpr_vi ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -290,14 +286,12 @@ define amdgpu_ps <4 x float> @struct_buffer_load_format_v4f32__vpr_rsrc__sgpr_vi ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; GFX12-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_VBUFFER_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_BOTHEN]].sub0 ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_BOTHEN]].sub1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll index 94ce8aac8a4c68..3e5e6cdd9d375d 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll @@ -324,7 +324,6 @@ define amdgpu_ps float @struct_buffer_load_f32__vgpr_rsrc__sgpr_vindex__sgpr_vof ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -351,14 +350,12 @@ define amdgpu_ps float @struct_buffer_load_f32__vgpr_rsrc__sgpr_vindex__sgpr_vof ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -378,7 +375,6 @@ define amdgpu_ps float @struct_buffer_load_f32__vgpr_rsrc__sgpr_vindex__sgpr_vof ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX12-NEXT: 
[[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -405,14 +401,12 @@ define amdgpu_ps float @struct_buffer_load_f32__vgpr_rsrc__sgpr_vindex__sgpr_vof ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll index f62a15d470afd2..2bfe4c24405b40 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll @@ -207,7 +207,6 @@ define amdgpu_ps void @struct_buffer_store_format_f16__sgpr_val__vgpr_rsrc__sgpr ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = 
COPY [[COPY6]] - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -234,14 +233,12 @@ define amdgpu_ps void @struct_buffer_store_format_f16__sgpr_val__vgpr_rsrc__sgpr ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_X_gfx80_BOTHEN_exact [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; @@ -262,7 +259,6 @@ define amdgpu_ps void @struct_buffer_store_format_f16__sgpr_val__vgpr_rsrc__sgpr ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; PACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; PACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -289,14 +285,12 @@ define amdgpu_ps void @struct_buffer_store_format_f16__sgpr_val__vgpr_rsrc__sgpr ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; PACKED-NEXT: 
BUFFER_STORE_FORMAT_D16_X_BOTHEN_exact [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 ; @@ -317,7 +311,6 @@ define amdgpu_ps void @struct_buffer_store_format_f16__sgpr_val__vgpr_rsrc__sgpr ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -344,14 +337,12 @@ define amdgpu_ps void @struct_buffer_store_format_f16__sgpr_val__vgpr_rsrc__sgpr ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; GFX12-NEXT: BUFFER_STORE_FORMAT_D16_X_VBUFFER_BOTHEN_exact [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: 
SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.format.f16(half %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f32.ll index 8a395f0e73222e..da8d9b71cdd9d7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f32.ll @@ -190,7 +190,6 @@ define amdgpu_ps void @struct_buffer_store_format_f32__sgpr_val__vgpr_rsrc__sgpr ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -217,14 +216,12 @@ define amdgpu_ps void @struct_buffer_store_format_f32__sgpr_val__vgpr_rsrc__sgpr ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; GFX8-NEXT: BUFFER_STORE_FORMAT_X_BOTHEN_exact [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: 
SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: S_ENDPGM 0 ; @@ -245,7 +242,6 @@ define amdgpu_ps void @struct_buffer_store_format_f32__sgpr_val__vgpr_rsrc__sgpr ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -272,14 +268,12 @@ define amdgpu_ps void @struct_buffer_store_format_f32__sgpr_val__vgpr_rsrc__sgpr ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; GFX12-NEXT: BUFFER_STORE_FORMAT_X_VBUFFER_BOTHEN_exact [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.format.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll index b89ed46ba05502..23daa6760c4179 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll @@ -195,7 +195,6 @@ define amdgpu_ps void @struct_buffer_store_v4f32_vgpr_rsrc__sgpr_val__sgpr_vinde ; GFX8-NEXT: [[COPY11:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] ; GFX8-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] ; GFX8-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -222,14 +221,12 @@ define amdgpu_ps void @struct_buffer_store_v4f32_vgpr_rsrc__sgpr_val__sgpr_vinde ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY13]], %subreg.sub1 ; GFX8-NEXT: BUFFER_STORE_DWORDX4_BOTHEN_exact [[COPY11]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: S_ENDPGM 0 ; @@ -254,7 +251,6 @@ define amdgpu_ps void @struct_buffer_store_v4f32_vgpr_rsrc__sgpr_val__sgpr_vinde ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] ; GFX12-NEXT: 
[[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -281,14 +277,12 @@ define amdgpu_ps void @struct_buffer_store_v4f32_vgpr_rsrc__sgpr_val__sgpr_vinde ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY13]], %subreg.sub1 ; GFX12-NEXT: BUFFER_STORE_DWORDX4_VBUFFER_BOTHEN_exact [[COPY11]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.add.ll index 2e7323068d108f..1365d9e5e6d02f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.add.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.add.ll @@ -118,7 +118,6 @@ define amdgpu_ps float @struct_ptr_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__s ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = 
COPY [[COPY]] ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -145,14 +144,12 @@ define amdgpu_ps float @struct_ptr_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__s ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; CHECK-NEXT: [[BUFFER_ATOMIC_ADD_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_BOTHEN_RTN [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_BOTHEN_RTN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -180,7 +177,6 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rs ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -207,14 +203,12 @@ define amdgpu_ps void 
@struct_ptr_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rs ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; CHECK-NEXT: BUFFER_ATOMIC_ADD_BOTHEN [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 %ret = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add.i32(i32 %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.cmpswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.cmpswap.ll index e8e6cab4edbe89..275da33fb75252 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.cmpswap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.cmpswap.ll @@ -73,7 +73,6 @@ define amdgpu_ps float @struct_ptr_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -102,14 
+101,12 @@ define amdgpu_ps float @struct_ptr_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; CHECK-NEXT: [[BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN]].sub0 - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[COPY17]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -139,7 +136,6 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgp ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -167,14 +163,12 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgp ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY12]], %subreg.sub1 ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], 
%subreg.sub1 ; CHECK-NEXT: BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 %ret = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -297,7 +291,6 @@ define amdgpu_ps double @struct_ptr_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cm ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -326,14 +319,12 @@ define amdgpu_ps double @struct_ptr_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cm ; CHECK-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0_sub1, [[COPY12]], %subreg.sub2_sub3 ; CHECK-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[COPY19:%[0-9]+]]:vreg_64 
= COPY [[BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN]].sub0_sub1 - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub0 ; CHECK-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub1 @@ -372,7 +363,6 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgp ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -400,14 +390,12 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgp ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY14]], %subreg.sub1 ; CHECK-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0_sub1, [[COPY12]], %subreg.sub2_sub3 ; CHECK-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, 
[[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 %ret = call i64 @llvm.amdgcn.struct.ptr.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.ll index 54657982493f78..7b1055a825a52a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.ll @@ -168,7 +168,6 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rs ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.3(0x80000000) @@ -195,14 +194,12 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rs ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX908-NEXT: 
[[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.4: ; GFX908-NEXT: successors: %bb.5(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.5: ; GFX908-NEXT: S_ENDPGM 0 ; @@ -223,7 +220,6 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rs ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX90A-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX90A-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX90A-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2: ; GFX90A-NEXT: successors: %bb.3(0x80000000) @@ -250,14 +246,12 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rs ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX90A-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX90A-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX90A-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4: ; GFX90A-NEXT: successors: %bb.5(0x80000000) ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.5: ; GFX90A-NEXT: S_ENDPGM 0 %ret = call float 
@llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -281,7 +275,6 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rs ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.3(0x80000000) @@ -307,14 +300,12 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rs ; GFX908-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.4: ; GFX908-NEXT: successors: %bb.5(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.5: ; GFX908-NEXT: S_ENDPGM 0 ; @@ -333,7 +324,6 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rs ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = 
COPY [[COPY]] ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX90A-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2: ; GFX90A-NEXT: successors: %bb.3(0x80000000) @@ -359,14 +349,12 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rs ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX90A-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX90A-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX90A-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4: ; GFX90A-NEXT: successors: %bb.5(0x80000000) ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.5: ; GFX90A-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.f16.ll index 6c0319ef570d69..9cb643b1e79579 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.f16.ll @@ -174,7 +174,6 @@ define amdgpu_ps <4 x half> @struct_ptr_buffer_load_format_v4f16__vpr_rsrc__sgpr ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], 
%subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -201,14 +200,12 @@ define amdgpu_ps <4 x half> @struct_ptr_buffer_load_format_v4f16__vpr_rsrc__sgpr ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; UNPACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>) from %ir.rsrc, align 1, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub0 ; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub1 @@ -249,7 +246,6 @@ define amdgpu_ps <4 x half> @struct_ptr_buffer_load_format_v4f16__vpr_rsrc__sgpr ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY 
[[COPY4]] ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -276,14 +272,12 @@ define amdgpu_ps <4 x half> @struct_ptr_buffer_load_format_v4f16__vpr_rsrc__sgpr ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; PACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>) from %ir.rsrc, align 1, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub0 ; PACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.ll index 1e3f94a5e39cb1..ef7c1b8e39a21c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.ll @@ -118,7 +118,6 @@ define amdgpu_ps <4 x float> 
@struct_ptr_buffer_load_format_v4f32__vpr_rsrc__sgp ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -145,14 +144,12 @@ define amdgpu_ps <4 x float> @struct_ptr_buffer_load_format_v4f32__vpr_rsrc__sgp ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>) from %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub0 ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.ll index 66c62e9ce8a9ca..ff49ff2a6e80d2 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.ll @@ -188,7 +188,6 @@ define amdgpu_ps float @struct_ptr_buffer_load_f32__vgpr_rsrc__sgpr_vindex__sgpr ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -215,14 +214,12 @@ define amdgpu_ps float @struct_ptr_buffer_load_f32__vgpr_rsrc__sgpr_vindex__sgpr ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.format.f16.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.format.f16.ll index 25fe7d2877ce34..0553df2b58c4af 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.format.f16.ll @@ -153,7 +153,6 @@ define amdgpu_ps void @struct_ptr_buffer_store_format_f16__sgpr_val__vgpr_rsrc__ ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -180,14 +179,12 @@ define amdgpu_ps void @struct_ptr_buffer_store_format_f16__sgpr_val__vgpr_rsrc__ ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_X_gfx80_BOTHEN_exact [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; @@ -208,7 +205,6 @@ define amdgpu_ps void @struct_ptr_buffer_store_format_f16__sgpr_val__vgpr_rsrc__ ; PACKED-NEXT: 
[[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; PACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; PACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -235,14 +231,12 @@ define amdgpu_ps void @struct_ptr_buffer_store_format_f16__sgpr_val__vgpr_rsrc__ ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_X_BOTHEN_exact [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.struct.ptr.buffer.store.format.f16(half %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.format.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.format.f32.ll index 3a4c258537814e..805e12fd56790b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.format.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.format.f32.ll @@ -112,7 +112,6 @@ define amdgpu_ps void 
@struct_ptr_buffer_store_format_f32__sgpr_val__vgpr_rsrc__ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -139,14 +138,12 @@ define amdgpu_ps void @struct_ptr_buffer_store_format_f32__sgpr_val__vgpr_rsrc__ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; CHECK-NEXT: BUFFER_STORE_FORMAT_X_BOTHEN_exact [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.struct.ptr.buffer.store.format.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.ll index 2e0a12b9d969cc..973ae17b0e948f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.ll @@ -117,7 +117,6 @@ define amdgpu_ps void 
@struct_ptr_buffer_store_v4f32_vgpr_rsrc__sgpr_val__sgpr_v ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -144,14 +143,12 @@ define amdgpu_ps void @struct_ptr_buffer_store_v4f32_vgpr_rsrc__sgpr_val__sgpr_v ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY13]], %subreg.sub1 ; CHECK-NEXT: BUFFER_STORE_DWORDX4_BOTHEN_exact [[COPY11]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>) into %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.f16.ll index 1a57c2e77bddff..b4e9e5fbfbd112 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.f16.ll @@ -215,7 
+215,6 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__ ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -242,14 +241,12 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__ ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; PACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>) from %ir.rsrc, align 1, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub0 ; PACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub1 @@ -272,7 +269,6 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__ ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], 
%subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -299,14 +295,12 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__ ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>) from %ir.rsrc, align 1, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub0 ; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.ll index 63143ed718054d..6825c02e7e5631 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.ll @@ -140,7 +140,6 @@ define amdgpu_ps <4 x float> @struct_tbuffer_load_v4f32__vgpr_rsrc__sgpr_vindex_ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -167,14 +166,12 @@ define amdgpu_ps <4 x float> @struct_tbuffer_load_v4f32__vgpr_rsrc__sgpr_vindex_ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>) from %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub0 ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub1 diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll index f270f87aae66d6..ea2c2f746c725d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll @@ -288,7 +288,6 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__ ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -315,14 +314,12 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__ ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; PACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub0 ; PACKED-NEXT: 
[[COPY14:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub1 @@ -345,7 +342,6 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX12-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -372,14 +368,12 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__ ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; GFX12-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XYZW_VBUFFER_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_VBUFFER_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 8) - ; GFX12-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_VBUFFER_BOTHEN]].sub0 ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_VBUFFER_BOTHEN]].sub1 @@ -402,7 +396,6 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__ ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY 
[[COPY4]] ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -429,14 +422,12 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__ ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub0 ; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll index 7d3ecd363befbe..8140e5fc55b95f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll @@ -242,7 +242,6 @@ define amdgpu_ps <4 x float> 
@struct_tbuffer_load_v4f32__vgpr_rsrc__sgpr_vindex_ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -269,14 +268,12 @@ define amdgpu_ps <4 x float> @struct_tbuffer_load_v4f32__vgpr_rsrc__sgpr_vindex_ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub0 ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub1 @@ -303,7 +300,6 @@ define amdgpu_ps <4 x float> @struct_tbuffer_load_v4f32__vgpr_rsrc__sgpr_vindex_ ; CHECK-GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK-GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; CHECK-GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; CHECK-GFX12-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-GFX12-NEXT: 
{{ $}} ; CHECK-GFX12-NEXT: bb.2: ; CHECK-GFX12-NEXT: successors: %bb.3(0x80000000) @@ -330,14 +326,12 @@ define amdgpu_ps <4 x float> @struct_tbuffer_load_v4f32__vgpr_rsrc__sgpr_vindex_ ; CHECK-GFX12-NEXT: {{ $}} ; CHECK-GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; CHECK-GFX12-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_VBUFFER_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_VBUFFER_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) - ; CHECK-GFX12-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-GFX12-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-GFX12-NEXT: {{ $}} ; CHECK-GFX12-NEXT: bb.4: ; CHECK-GFX12-NEXT: successors: %bb.5(0x80000000) ; CHECK-GFX12-NEXT: {{ $}} - ; CHECK-GFX12-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-GFX12-NEXT: {{ $}} ; CHECK-GFX12-NEXT: bb.5: ; CHECK-GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_VBUFFER_BOTHEN]].sub0 ; CHECK-GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_VBUFFER_BOTHEN]].sub1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wave.reconverge.i32.ll similarity index 95% rename from llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll rename to llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wave.reconverge.i32.ll index 6415e185446f53..4d2c9e575f18a7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wave.reconverge.i32.ll @@ -10,14 +10,14 @@ define 
amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) { ; GFX10-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX10-NEXT: ; %bb.1: ; %mid +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: global_store_dword v[0:1], v0, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: .LBB0_2: ; %bb -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-NEXT: .LBB0_2: ; %bb ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: global_store_dword v[0:1], v0, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -30,13 +30,13 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) { ; GFX11-NEXT: s_cmp_lg_u32 s0, 0 ; GFX11-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX11-NEXT: ; %bb.1: ; %mid +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: .LBB0_2: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: .LBB0_2: ; %bb ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -49,12 +49,12 @@ entry: mid: store volatile i32 0, ptr addrspace(1) undef + call void @llvm.amdgcn.wave.reconverge.i32(i32 %saved) br label %bb bb: - call void @llvm.amdgcn.end.cf.i32(i32 %saved) store volatile i32 0, ptr addrspace(1) undef ret void } -declare void @llvm.amdgcn.end.cf.i32(i32 %val) +declare void @llvm.amdgcn.wave.reconverge.i32(i32 %val) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wave.reconverge.i64.ll similarity index 86% rename from llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll rename to 
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wave.reconverge.i64.ll index 06393857352b3a..5960ab89da7629 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wave.reconverge.i64.ll @@ -9,13 +9,12 @@ define amdgpu_kernel void @test_wave64(i32 %arg0, i64 %saved) { ; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %mid +; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: global_store_dword v[0:1], v0, off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: .LBB0_2: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN-NEXT: .LBB0_2: ; %bb ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: global_store_dword v[0:1], v0, off ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -26,12 +25,12 @@ entry: mid: store volatile i32 0, ptr addrspace(1) undef + call void @llvm.amdgcn.wave.reconverge.i64(i64 %saved) br label %bb bb: - call void @llvm.amdgcn.end.cf.i64(i64 %saved) store volatile i32 0, ptr addrspace(1) undef ret void } -declare void @llvm.amdgcn.end.cf.i64(i64 %val) +declare void @llvm.amdgcn.wave.reconverge.i64(i64 %val) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll index 0f60f40bd337be..ba3439c7e6523b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll @@ -159,21 +159,22 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) { ; SI: ; %bb.0: ; %.entry ; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 ; SI-NEXT: v_cvt_i32_f32_e32 v1, v1 +; SI-NEXT: s_mov_b64 s[2:3], exec ; SI-NEXT: s_mov_b64 s[0:1], exec ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 1, v0 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; SI-NEXT: s_xor_b64 s[2:3], vcc, -1 
-; SI-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] -; SI-NEXT: s_xor_b64 s[2:3], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB2_3 +; SI-NEXT: s_xor_b64 s[4:5], vcc, -1 +; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; SI-NEXT: s_cmov_b64 exec, s[4:5] +; SI-NEXT: s_cbranch_scc0 .LBB2_3 ; SI-NEXT: ; %bb.1: ; %.demote -; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], exec ; SI-NEXT: s_cbranch_scc0 .LBB2_4 ; SI-NEXT: ; %bb.2: ; %.demote ; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: .LBB2_3: ; %.continue -; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc ; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm ; SI-NEXT: s_endpgm @@ -186,21 +187,22 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) { ; GFX9: ; %bb.0: ; %.entry ; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_xor_b64 s[2:3], vcc, -1 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_3 +; GFX9-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX9-NEXT: ; %bb.1: ; %.demote -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], exec ; GFX9-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX9-NEXT: ; %bb.2: ; %.demote ; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: .LBB2_3: ; %.continue -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc ; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm ; GFX9-NEXT: s_endpgm @@ -213,21 +215,22 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) { ; GFX10-32: ; %bb.0: ; %.entry ; 
GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX10-32-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX10-32-NEXT: s_mov_b32 s1, exec_lo ; GFX10-32-NEXT: s_mov_b32 s0, exec_lo ; GFX10-32-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-32-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-32-NEXT: s_xor_b32 s1, vcc_lo, -1 -; GFX10-32-NEXT: s_and_saveexec_b32 s2, s1 -; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s2 -; GFX10-32-NEXT: s_cbranch_execz .LBB2_3 +; GFX10-32-NEXT: s_xor_b32 s2, vcc_lo, -1 +; GFX10-32-NEXT: s_and_b32 s2, s2, exec_lo +; GFX10-32-NEXT: s_cmov_b32 exec_lo, s2 +; GFX10-32-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote -; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo +; GFX10-32-NEXT: s_andn2_b32 s1, s1, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX10-32-NEXT: ; %bb.2: ; %.demote ; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX10-32-NEXT: .LBB2_3: ; %.continue -; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo ; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm ; GFX10-32-NEXT: s_endpgm @@ -240,21 +243,22 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) { ; GFX10-64: ; %bb.0: ; %.entry ; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX10-64-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX10-64-NEXT: s_mov_b64 s[2:3], exec ; GFX10-64-NEXT: s_mov_b64 s[0:1], exec ; GFX10-64-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-64-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10-64-NEXT: s_xor_b64 s[2:3], vcc, -1 -; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] -; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[4:5] -; GFX10-64-NEXT: s_cbranch_execz .LBB2_3 +; GFX10-64-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GFX10-64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10-64-NEXT: s_cmov_b64 exec, s[4:5] +; GFX10-64-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote -; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; 
GFX10-64-NEXT: s_andn2_b64 s[2:3], s[2:3], exec ; GFX10-64-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX10-64-NEXT: ; %bb.2: ; %.demote ; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-64-NEXT: .LBB2_3: ; %.continue -; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc ; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm ; GFX10-64-NEXT: s_endpgm @@ -286,17 +290,18 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre ; SI-NEXT: s_mov_b64 s[12:13], exec ; SI-NEXT: s_wqm_b64 exec, exec ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 -; SI-NEXT: s_and_saveexec_b64 s[14:15], vcc -; SI-NEXT: s_xor_b64 s[14:15], exec, s[14:15] -; SI-NEXT: s_cbranch_execz .LBB3_3 +; SI-NEXT: s_mov_b64 s[14:15], exec +; SI-NEXT: s_cmp_lg_u64 vcc, 0 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB3_3 ; SI-NEXT: ; %bb.1: ; %.demote ; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; SI-NEXT: s_cbranch_scc0 .LBB3_4 ; SI-NEXT: ; %bb.2: ; %.demote ; SI-NEXT: s_wqm_b64 s[16:17], s[12:13] ; SI-NEXT: s_and_b64 exec, exec, s[16:17] -; SI-NEXT: .LBB3_3: ; %.continue ; SI-NEXT: s_or_b64 exec, exec, s[14:15] +; SI-NEXT: .LBB3_3: ; %.continue ; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v0, v0, v0 @@ -315,17 +320,18 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 -; GFX9-NEXT: s_and_saveexec_b64 s[14:15], vcc -; GFX9-NEXT: s_xor_b64 s[14:15], exec, s[14:15] -; GFX9-NEXT: s_cbranch_execz .LBB3_3 +; GFX9-NEXT: s_mov_b64 s[14:15], exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX9-NEXT: ; %bb.1: ; %.demote ; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; GFX9-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX9-NEXT: ; %bb.2: ; 
%.demote ; GFX9-NEXT: s_wqm_b64 s[16:17], s[12:13] ; GFX9-NEXT: s_and_b64 exec, exec, s[16:17] -; GFX9-NEXT: .LBB3_3: ; %.continue ; GFX9-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-NEXT: .LBB3_3: ; %.continue ; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 @@ -344,17 +350,18 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-32-NEXT: s_mov_b32 s12, exec_lo ; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v1 -; GFX10-32-NEXT: s_and_saveexec_b32 s13, vcc_lo -; GFX10-32-NEXT: s_xor_b32 s13, exec_lo, s13 -; GFX10-32-NEXT: s_cbranch_execz .LBB3_3 +; GFX10-32-NEXT: s_mov_b32 s13, exec_lo +; GFX10-32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-32-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote ; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX10-32-NEXT: ; %bb.2: ; %.demote ; GFX10-32-NEXT: s_wqm_b32 s14, s12 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s14 -; GFX10-32-NEXT: .LBB3_3: ; %.continue ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-32-NEXT: .LBB3_3: ; %.continue ; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-32-NEXT: s_waitcnt vmcnt(0) ; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 @@ -373,17 +380,18 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-64-NEXT: s_mov_b64 s[12:13], exec ; GFX10-64-NEXT: s_wqm_b64 exec, exec ; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 -; GFX10-64-NEXT: s_and_saveexec_b64 s[14:15], vcc -; GFX10-64-NEXT: s_xor_b64 s[14:15], exec, s[14:15] -; GFX10-64-NEXT: s_cbranch_execz .LBB3_3 +; GFX10-64-NEXT: s_mov_b64 s[14:15], exec +; GFX10-64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX10-64-NEXT: s_cmov_b64 exec, vcc +; GFX10-64-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote ; 
GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; GFX10-64-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX10-64-NEXT: ; %bb.2: ; %.demote ; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] -; GFX10-64-NEXT: .LBB3_3: ; %.continue ; GFX10-64-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX10-64-NEXT: .LBB3_3: ; %.continue ; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-64-NEXT: s_waitcnt vmcnt(0) ; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 @@ -420,19 +428,20 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre ; SI-NEXT: s_mov_b64 s[12:13], exec ; SI-NEXT: s_wqm_b64 exec, exec ; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; SI-NEXT: s_mov_b64 s[14:15], exec ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 -; SI-NEXT: s_and_saveexec_b64 s[14:15], vcc -; SI-NEXT: s_xor_b64 s[14:15], exec, s[14:15] -; SI-NEXT: s_cbranch_execz .LBB4_3 +; SI-NEXT: s_cmp_lg_u64 vcc, 0 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB4_3 ; SI-NEXT: ; %bb.1: ; %.demote ; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; SI-NEXT: s_cbranch_scc0 .LBB4_4 ; SI-NEXT: ; %bb.2: ; %.demote ; SI-NEXT: s_wqm_b64 s[16:17], s[12:13] ; SI-NEXT: s_and_b64 exec, exec, s[16:17] -; SI-NEXT: .LBB4_3: ; %.continue ; SI-NEXT: s_or_b64 exec, exec, s[14:15] +; SI-NEXT: .LBB4_3: ; %.continue ; SI-NEXT: v_add_f32_e32 v0, v0, v0 ; SI-NEXT: s_and_b64 exec, exec, s[12:13] ; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf @@ -449,19 +458,20 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[14:15], vcc -; GFX9-NEXT: s_xor_b64 s[14:15], exec, 
s[14:15] -; GFX9-NEXT: s_cbranch_execz .LBB4_3 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX9-NEXT: ; %bb.1: ; %.demote ; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; GFX9-NEXT: s_cbranch_scc0 .LBB4_4 ; GFX9-NEXT: ; %bb.2: ; %.demote ; GFX9-NEXT: s_wqm_b64 s[16:17], s[12:13] ; GFX9-NEXT: s_and_b64 exec, exec, s[16:17] -; GFX9-NEXT: .LBB4_3: ; %.continue ; GFX9-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-NEXT: .LBB4_3: ; %.continue ; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf @@ -478,19 +488,20 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-32-NEXT: s_mov_b32 s12, exec_lo ; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_mov_b32 s13, exec_lo ; GFX10-32-NEXT: s_waitcnt vmcnt(0) ; GFX10-32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v0 -; GFX10-32-NEXT: s_and_saveexec_b32 s13, vcc_lo -; GFX10-32-NEXT: s_xor_b32 s13, exec_lo, s13 -; GFX10-32-NEXT: s_cbranch_execz .LBB4_3 +; GFX10-32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-32-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote ; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 .LBB4_4 ; GFX10-32-NEXT: ; %bb.2: ; %.demote ; GFX10-32-NEXT: s_wqm_b32 s14, s12 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s14 -; GFX10-32-NEXT: .LBB4_3: ; %.continue ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-32-NEXT: .LBB4_3: ; %.continue ; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D @@ -507,19 +518,20 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-64-NEXT: s_mov_b64 
s[12:13], exec ; GFX10-64-NEXT: s_wqm_b64 exec, exec ; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_mov_b64 s[14:15], exec ; GFX10-64-NEXT: s_waitcnt vmcnt(0) ; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 -; GFX10-64-NEXT: s_and_saveexec_b64 s[14:15], vcc -; GFX10-64-NEXT: s_xor_b64 s[14:15], exec, s[14:15] -; GFX10-64-NEXT: s_cbranch_execz .LBB4_3 +; GFX10-64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX10-64-NEXT: s_cmov_b64 exec, vcc +; GFX10-64-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote ; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; GFX10-64-NEXT: s_cbranch_scc0 .LBB4_4 ; GFX10-64-NEXT: ; %bb.2: ; %.demote ; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] -; GFX10-64-NEXT: .LBB4_3: ; %.continue ; GFX10-64-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX10-64-NEXT: .LBB4_3: ; %.continue ; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D @@ -663,39 +675,41 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; SI-NEXT: s_wqm_b64 exec, exec ; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc -; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; SI-NEXT: s_cbranch_execz .LBB6_3 +; SI-NEXT: s_xor_b64 s[2:3], vcc, exec +; SI-NEXT: s_cmp_lg_u64 vcc, 0 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB6_3 ; SI-NEXT: ; %bb.1: ; %.demote0 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; SI-NEXT: s_cbranch_scc0 .LBB6_7 ; SI-NEXT: ; %bb.2: ; %.demote0 ; SI-NEXT: s_wqm_b64 s[4:5], s[0:1] ; SI-NEXT: s_and_b64 exec, exec, s[4:5] -; SI-NEXT: .LBB6_3: ; %.continue0 ; SI-NEXT: s_or_b64 exec, exec, s[2:3] -; SI-NEXT: s_mov_b64 s[2:3], s[0:1] -; SI-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] +; SI-NEXT: .LBB6_3: ; %.continue0 +; SI-NEXT: s_mov_b64 
s[4:5], s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[4:5] ; SI-NEXT: v_mov_b32_e32 v1, v0 -; SI-NEXT: s_nop 1 +; SI-NEXT: s_mov_b64 s[2:3], exec +; SI-NEXT: s_nop 0 ; SI-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; SI-NEXT: s_nop 1 ; SI-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; SI-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; SI-NEXT: s_and_b64 exec, exec, s[0:1] ; SI-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 -; SI-NEXT: s_and_b64 s[2:3], s[0:1], vcc -; SI-NEXT: s_xor_b64 s[2:3], s[2:3], -1 -; SI-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] -; SI-NEXT: s_xor_b64 s[2:3], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB6_6 +; SI-NEXT: s_and_b64 s[4:5], s[0:1], vcc +; SI-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; SI-NEXT: s_cmov_b64 exec, s[4:5] +; SI-NEXT: s_cbranch_scc0 .LBB6_6 ; SI-NEXT: ; %bb.4: ; %.demote1 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; SI-NEXT: s_cbranch_scc0 .LBB6_7 ; SI-NEXT: ; %bb.5: ; %.demote1 ; SI-NEXT: s_mov_b64 exec, 0 -; SI-NEXT: .LBB6_6: ; %.continue1 ; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: .LBB6_6: ; %.continue1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3c00 ; SI-NEXT: v_bfrev_b32_e32 v1, 60 ; SI-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm @@ -711,39 +725,41 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9-NEXT: s_cbranch_execz .LBB6_3 +; GFX9-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX9-NEXT: ; %bb.1: ; %.demote0 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cbranch_scc0 .LBB6_7 ; GFX9-NEXT: ; %bb.2: ; %.demote0 ; GFX9-NEXT: s_wqm_b64 s[4:5], s[0:1] 
; GFX9-NEXT: s_and_b64 exec, exec, s[4:5] -; GFX9-NEXT: .LBB6_3: ; %.continue0 ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_mov_b64 s[2:3], s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] +; GFX9-NEXT: .LBB6_3: ; %.continue0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX9-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_b64 s[2:3], s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], -1 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_6 +; GFX9-NEXT: s_and_b64 s[4:5], s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-NEXT: s_cbranch_scc0 .LBB6_6 ; GFX9-NEXT: ; %bb.4: ; %.demote1 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cbranch_scc0 .LBB6_7 ; GFX9-NEXT: ; %bb.5: ; %.demote1 ; GFX9-NEXT: s_mov_b64 exec, 0 -; GFX9-NEXT: .LBB6_6: ; %.continue1 ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB6_6: ; %.continue1 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 60 ; GFX9-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm @@ -759,37 +775,39 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s1 -; 
GFX10-32-NEXT: s_cbranch_execz .LBB6_3 +; GFX10-32-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX10-32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-32-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote0 ; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 .LBB6_7 ; GFX10-32-NEXT: ; %bb.2: ; %.demote0 ; GFX10-32-NEXT: s_wqm_b32 s2, s0 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s2 -; GFX10-32-NEXT: .LBB6_3: ; %.continue0 ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10-32-NEXT: s_mov_b32 s1, s0 -; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s1 +; GFX10-32-NEXT: .LBB6_3: ; %.continue0 +; GFX10-32-NEXT: s_mov_b32 s2, s0 +; GFX10-32-NEXT: s_mov_b32 s1, exec_lo +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s2 ; GFX10-32-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-32-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-32-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 ; GFX10-32-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0 -; GFX10-32-NEXT: s_and_b32 s1, s0, vcc_lo -; GFX10-32-NEXT: s_xor_b32 s1, s1, -1 -; GFX10-32-NEXT: s_and_saveexec_b32 s2, s1 -; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s2 -; GFX10-32-NEXT: s_cbranch_execz .LBB6_6 +; GFX10-32-NEXT: s_and_b32 s2, s0, vcc_lo +; GFX10-32-NEXT: s_xor_b32 s2, s2, -1 +; GFX10-32-NEXT: s_and_b32 s2, s2, exec_lo +; GFX10-32-NEXT: s_cmov_b32 exec_lo, s2 +; GFX10-32-NEXT: s_cbranch_scc0 .LBB6_6 ; GFX10-32-NEXT: ; %bb.4: ; %.demote1 ; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 .LBB6_7 ; GFX10-32-NEXT: ; %bb.5: ; %.demote1 ; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 -; GFX10-32-NEXT: .LBB6_6: ; %.continue1 ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-32-NEXT: .LBB6_6: ; %.continue1 ; GFX10-32-NEXT: v_mov_b32_e32 v0, 0x3c00 ; 
GFX10-32-NEXT: v_bfrev_b32_e32 v1, 60 ; GFX10-32-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm @@ -805,37 +823,39 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; GFX10-64-NEXT: s_wqm_b64 exec, exec ; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX10-64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX10-64-NEXT: s_cbranch_execz .LBB6_3 +; GFX10-64-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX10-64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX10-64-NEXT: s_cmov_b64 exec, vcc +; GFX10-64-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote0 ; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX10-64-NEXT: s_cbranch_scc0 .LBB6_7 ; GFX10-64-NEXT: ; %bb.2: ; %.demote0 ; GFX10-64-NEXT: s_wqm_b64 s[4:5], s[0:1] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[4:5] -; GFX10-64-NEXT: .LBB6_3: ; %.continue0 ; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX10-64-NEXT: s_mov_b64 s[2:3], s[0:1] -; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] +; GFX10-64-NEXT: .LBB6_3: ; %.continue0 +; GFX10-64-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX10-64-NEXT: s_mov_b64 s[2:3], exec +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[4:5] ; GFX10-64-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-64-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-64-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX10-64-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 -; GFX10-64-NEXT: s_and_b64 s[2:3], s[0:1], vcc -; GFX10-64-NEXT: s_xor_b64 s[2:3], s[2:3], -1 -; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] -; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[4:5] -; GFX10-64-NEXT: s_cbranch_execz .LBB6_6 +; GFX10-64-NEXT: s_and_b64 s[4:5], s[0:1], vcc +; GFX10-64-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GFX10-64-NEXT: s_and_b64 
s[4:5], s[4:5], exec +; GFX10-64-NEXT: s_cmov_b64 exec, s[4:5] +; GFX10-64-NEXT: s_cbranch_scc0 .LBB6_6 ; GFX10-64-NEXT: ; %bb.4: ; %.demote1 ; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX10-64-NEXT: s_cbranch_scc0 .LBB6_7 ; GFX10-64-NEXT: ; %bb.5: ; %.demote1 ; GFX10-64-NEXT: s_mov_b64 exec, 0 -; GFX10-64-NEXT: .LBB6_6: ; %.continue1 ; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10-64-NEXT: .LBB6_6: ; %.continue1 ; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX10-64-NEXT: v_bfrev_b32_e32 v1, 60 ; GFX10-64-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm @@ -887,44 +907,46 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s4, 0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc -; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; SI-NEXT: s_cbranch_execz .LBB7_3 +; SI-NEXT: s_xor_b64 s[2:3], vcc, exec +; SI-NEXT: s_cmp_lg_u64 vcc, 0 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB7_3 ; SI-NEXT: ; %bb.1: ; %.demote0 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; SI-NEXT: s_cbranch_scc0 .LBB7_9 ; SI-NEXT: ; %bb.2: ; %.demote0 ; SI-NEXT: s_wqm_b64 s[6:7], s[0:1] ; SI-NEXT: s_and_b64 exec, exec, s[6:7] -; SI-NEXT: .LBB7_3: ; %.continue0.preheader ; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: .LBB7_3: ; %.continue0.preheader ; SI-NEXT: s_mov_b64 s[2:3], 0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_branch .LBB7_5 ; SI-NEXT: .LBB7_4: ; %.continue1 ; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_add_u32_e32 v0, vcc, 1, v0 ; SI-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1 ; SI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; SI-NEXT: s_andn2_b64 exec, exec, s[2:3] -; SI-NEXT: s_cbranch_execz .LBB7_8 +; SI-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; SI-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; SI-NEXT: s_cbranch_scc0 .LBB7_8 ; SI-NEXT: .LBB7_5: ; %.continue0 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 
-; SI-NEXT: s_mov_b64 s[4:5], s[0:1] -; SI-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[4:5] +; SI-NEXT: s_mov_b64 s[6:7], s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[6:7] ; SI-NEXT: v_mov_b32_e32 v3, v2 -; SI-NEXT: s_nop 1 +; SI-NEXT: s_mov_b64 s[4:5], exec +; SI-NEXT: s_nop 0 ; SI-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; SI-NEXT: s_nop 1 ; SI-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; SI-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec ; SI-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; SI-NEXT: s_and_b64 s[4:5], s[0:1], vcc -; SI-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; SI-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SI-NEXT: s_xor_b64 s[4:5], exec, s[6:7] -; SI-NEXT: s_cbranch_execz .LBB7_4 +; SI-NEXT: s_and_b64 s[6:7], s[0:1], vcc +; SI-NEXT: s_xor_b64 s[6:7], s[6:7], -1 +; SI-NEXT: s_and_b64 s[6:7], s[6:7], exec +; SI-NEXT: s_cmov_b64 exec, s[6:7] +; SI-NEXT: s_cbranch_scc0 .LBB7_4 ; SI-NEXT: ; %bb.6: ; %.demote1 ; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec @@ -933,9 +955,9 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 ; SI-NEXT: s_wqm_b64 s[6:7], s[0:1] ; SI-NEXT: s_and_b64 exec, exec, s[6:7] +; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_branch .LBB7_4 ; SI-NEXT: .LBB7_8: ; %.return -; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: s_and_b64 exec, exec, s[0:1] ; SI-NEXT: v_mov_b32_e32 v0, 0x3c00 ; SI-NEXT: v_bfrev_b32_e32 v1, 60 @@ -953,44 +975,46 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9-NEXT: s_cbranch_execz .LBB7_3 +; GFX9-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; 
GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX9-NEXT: ; %bb.1: ; %.demote0 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cbranch_scc0 .LBB7_9 ; GFX9-NEXT: ; %bb.2: ; %.demote0 ; GFX9-NEXT: s_wqm_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_and_b64 exec, exec, s[6:7] -; GFX9-NEXT: .LBB7_3: ; %.continue0.preheader ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB7_3: ; %.continue0.preheader ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_branch .LBB7_5 ; GFX9-NEXT: .LBB7_4: ; %.continue1 ; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_add_u32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execz .LBB7_8 +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc0 .LBB7_8 ; GFX9-NEXT: .LBB7_5: ; %.continue0 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[4:5] +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec ; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX9-NEXT: s_and_b64 s[4:5], s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GFX9-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[6:7] -; GFX9-NEXT: s_cbranch_execz .LBB7_4 +; GFX9-NEXT: s_and_b64 s[6:7], s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], -1 +; 
GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GFX9-NEXT: s_cmov_b64 exec, s[6:7] +; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX9-NEXT: ; %bb.6: ; %.demote1 ; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec @@ -999,9 +1023,9 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 ; GFX9-NEXT: s_wqm_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_and_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_branch .LBB7_4 ; GFX9-NEXT: .LBB7_8: ; %.return -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 60 @@ -1019,41 +1043,43 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX10-32-NEXT: s_mov_b32 s1, 0 ; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10-32-NEXT: s_xor_b32 s2, exec_lo, s2 -; GFX10-32-NEXT: s_cbranch_execz .LBB7_3 +; GFX10-32-NEXT: s_xor_b32 s2, vcc_lo, exec_lo +; GFX10-32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-32-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote0 ; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 .LBB7_9 ; GFX10-32-NEXT: ; %bb.2: ; %.demote0 ; GFX10-32-NEXT: s_wqm_b32 s3, s0 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3 -; GFX10-32-NEXT: .LBB7_3: ; %.continue0.preheader ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-32-NEXT: .LBB7_3: ; %.continue0.preheader ; GFX10-32-NEXT: v_mov_b32_e32 v0, s1 ; GFX10-32-NEXT: s_branch .LBB7_5 ; GFX10-32-NEXT: .LBB7_4: ; %.continue1 ; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-32-NEXT: v_add_nc_u32_e32 v0, 1, v0 ; GFX10-32-NEXT: v_cmp_ge_i32_e32 vcc_lo, v0, v1 ; GFX10-32-NEXT: s_or_b32 s1, 
vcc_lo, s1 -; GFX10-32-NEXT: s_andn2_b32 exec_lo, exec_lo, s1 -; GFX10-32-NEXT: s_cbranch_execz .LBB7_8 +; GFX10-32-NEXT: s_andn2_b32 s2, exec_lo, s1 +; GFX10-32-NEXT: s_cselect_b32 exec_lo, s2, s1 +; GFX10-32-NEXT: s_cbranch_scc0 .LBB7_8 ; GFX10-32-NEXT: .LBB7_5: ; %.continue0 ; GFX10-32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-32-NEXT: s_mov_b32 s2, s0 -; GFX10-32-NEXT: v_cndmask_b32_e64 v2, v0, 0, s2 +; GFX10-32-NEXT: s_mov_b32 s3, s0 +; GFX10-32-NEXT: s_mov_b32 s2, exec_lo +; GFX10-32-NEXT: v_cndmask_b32_e64 v2, v0, 0, s3 ; GFX10-32-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-32-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-32-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-32-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec ; GFX10-32-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 -; GFX10-32-NEXT: s_and_b32 s2, s0, vcc_lo -; GFX10-32-NEXT: s_xor_b32 s2, s2, -1 -; GFX10-32-NEXT: s_and_saveexec_b32 s3, s2 -; GFX10-32-NEXT: s_xor_b32 s2, exec_lo, s3 -; GFX10-32-NEXT: s_cbranch_execz .LBB7_4 +; GFX10-32-NEXT: s_and_b32 s3, s0, vcc_lo +; GFX10-32-NEXT: s_xor_b32 s3, s3, -1 +; GFX10-32-NEXT: s_and_b32 s3, s3, exec_lo +; GFX10-32-NEXT: s_cmov_b32 exec_lo, s3 +; GFX10-32-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX10-32-NEXT: ; %bb.6: ; %.demote1 ; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 ; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo @@ -1062,9 +1088,9 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 ; GFX10-32-NEXT: s_wqm_b32 s3, s0 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3 +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-32-NEXT: s_branch .LBB7_4 ; GFX10-32-NEXT: .LBB7_8: ; %.return -; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 ; GFX10-32-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX10-32-NEXT: v_bfrev_b32_e32 v1, 60 @@ -1082,42 +1108,44 
@@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX10-64-NEXT: s_mov_b32 s4, 0 ; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX10-64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX10-64-NEXT: s_cbranch_execz .LBB7_3 +; GFX10-64-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX10-64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX10-64-NEXT: s_cmov_b64 exec, vcc +; GFX10-64-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote0 ; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX10-64-NEXT: s_cbranch_scc0 .LBB7_9 ; GFX10-64-NEXT: ; %bb.2: ; %.demote0 ; GFX10-64-NEXT: s_wqm_b64 s[6:7], s[0:1] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7] -; GFX10-64-NEXT: .LBB7_3: ; %.continue0.preheader ; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10-64-NEXT: .LBB7_3: ; %.continue0.preheader ; GFX10-64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-64-NEXT: s_mov_b64 s[2:3], 0 ; GFX10-64-NEXT: s_branch .LBB7_5 ; GFX10-64-NEXT: .LBB7_4: ; %.continue1 ; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX10-64-NEXT: v_add_nc_u32_e32 v0, 1, v0 ; GFX10-64-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1 ; GFX10-64-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX10-64-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX10-64-NEXT: s_cbranch_execz .LBB7_8 +; GFX10-64-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX10-64-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX10-64-NEXT: s_cbranch_scc0 .LBB7_8 ; GFX10-64-NEXT: .LBB7_5: ; %.continue0 ; GFX10-64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-64-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX10-64-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[4:5] +; GFX10-64-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX10-64-NEXT: s_mov_b64 s[4:5], exec +; GFX10-64-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[6:7] ; GFX10-64-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-64-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; 
GFX10-64-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-64-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec ; GFX10-64-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX10-64-NEXT: s_and_b64 s[4:5], s[0:1], vcc -; GFX10-64-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GFX10-64-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GFX10-64-NEXT: s_xor_b64 s[4:5], exec, s[6:7] -; GFX10-64-NEXT: s_cbranch_execz .LBB7_4 +; GFX10-64-NEXT: s_and_b64 s[6:7], s[0:1], vcc +; GFX10-64-NEXT: s_xor_b64 s[6:7], s[6:7], -1 +; GFX10-64-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GFX10-64-NEXT: s_cmov_b64 exec, s[6:7] +; GFX10-64-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX10-64-NEXT: ; %bb.6: ; %.demote1 ; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 ; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec @@ -1126,9 +1154,9 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 ; GFX10-64-NEXT: s_wqm_b64 s[6:7], s[0:1] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7] +; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX10-64-NEXT: s_branch .LBB7_4 ; GFX10-64-NEXT: .LBB7_8: ; %.return -; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX10-64-NEXT: v_bfrev_b32_e32 v1, 60 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll index de9af5209db16c..22f1277795fbd2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll @@ -8,15 +8,11 @@ define amdgpu_cs void @memmove_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src ; LOOP-LABEL: memmove_p1i8: ; LOOP: ; %bb.0: ; LOOP-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[0:1] -; LOOP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; LOOP-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; LOOP-NEXT: s_cbranch_execnz .LBB0_3 -; LOOP-NEXT: ; %bb.1: ; %Flow -; LOOP-NEXT: 
s_andn2_saveexec_b64 s[0:1], s[0:1] -; LOOP-NEXT: s_cbranch_execnz .LBB0_4 -; LOOP-NEXT: .LBB0_2: ; %memmove_done -; LOOP-NEXT: s_endpgm -; LOOP-NEXT: .LBB0_3: +; LOOP-NEXT: s_xor_b64 s[0:1], vcc, exec +; LOOP-NEXT: s_and_b64 s[2:3], vcc, -1 +; LOOP-NEXT: s_cmov_b64 exec, vcc +; LOOP-NEXT: s_cbranch_scc0 .LBB0_2 +; LOOP-NEXT: ; %bb.1: ; LOOP-NEXT: s_mov_b32 s6, 0 ; LOOP-NEXT: s_mov_b32 s7, 0xf000 ; LOOP-NEXT: s_mov_b64 s[4:5], 0 @@ -44,9 +40,13 @@ define amdgpu_cs void @memmove_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src ; LOOP-NEXT: buffer_store_byte v2, v[0:1], s[4:7], 0 addr64 offset:3 ; LOOP-NEXT: ; implicit-def: $vgpr2_vgpr3 ; LOOP-NEXT: ; implicit-def: $vgpr0_vgpr1 -; LOOP-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; LOOP-NEXT: s_cbranch_execz .LBB0_2 -; LOOP-NEXT: .LBB0_4: ; %memmove_bwd_residual +; LOOP-NEXT: s_or_b64 exec, exec, s[0:1] +; LOOP-NEXT: .LBB0_2: ; %Flow +; LOOP-NEXT: s_xor_b64 s[2:3], s[0:1], exec +; LOOP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; LOOP-NEXT: s_cmov_b64 exec, s[0:1] +; LOOP-NEXT: s_cbranch_scc0 .LBB0_4 +; LOOP-NEXT: ; %bb.3: ; %memmove_bwd_residual ; LOOP-NEXT: s_mov_b32 s2, 0 ; LOOP-NEXT: s_mov_b32 s3, 0xf000 ; LOOP-NEXT: s_mov_b64 s[0:1], 0 @@ -74,6 +74,7 @@ define amdgpu_cs void @memmove_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src ; LOOP-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:1 ; LOOP-NEXT: buffer_store_byte v3, v[0:1], s[0:3], 0 addr64 offset:2 ; LOOP-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:3 +; LOOP-NEXT: .LBB0_4: ; %memmove_done ; LOOP-NEXT: s_endpgm ; ; UNROLL-LABEL: memmove_p1i8: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll index 646cb48d37367b..9240d5de2174fe 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll @@ -171,16 +171,12 @@ define void @localize_internal_globals(i1 %cond) { ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GFX9-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[6:7] -; GFX9-NEXT: s_cbranch_execnz .LBB2_3 -; GFX9-NEXT: ; %bb.1: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB2_4 -; GFX9-NEXT: .LBB2_2: ; %bb2 -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB2_3: ; %bb1 +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], exec +; GFX9-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; GFX9-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX9-NEXT: s_cmov_b64 exec, s[6:7] +; GFX9-NEXT: s_cbranch_scc0 .LBB2_2 +; GFX9-NEXT: ; %bb.1: ; %bb1 ; GFX9-NEXT: s_getpc_b64 s[6:7] ; GFX9-NEXT: s_add_u32 s6, s6, static.gv2@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s7, s7, static.gv2@rel32@hi+12 @@ -193,22 +189,27 @@ define void @localize_internal_globals(i1 %cond) { ; GFX9-NEXT: v_mov_b32_e32 v1, 1 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_2 -; GFX9-NEXT: .LBB2_4: ; %bb0 -; GFX9-NEXT: s_getpc_b64 s[6:7] -; GFX9-NEXT: s_add_u32 s6, s6, static.gv0@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s7, s7, static.gv0@rel32@hi+12 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB2_2: ; %Flow +; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-NEXT: s_cbranch_scc0 .LBB2_4 +; GFX9-NEXT: ; %bb.3: ; %bb0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, static.gv0@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, static.gv0@rel32@hi+12 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_store_dword v0, v0, s[6:7] +; GFX9-NEXT: global_store_dword v0, v0, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_getpc_b64 s[6:7] -; GFX9-NEXT: s_add_u32 s6, s6, static.gv1@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s7, s7, static.gv1@rel32@hi+12 +; GFX9-NEXT: 
s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, static.gv1@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, static.gv1@rel32@hi+12 ; GFX9-NEXT: v_mov_b32_e32 v1, 1 -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: .LBB2_4: ; %bb2 ; GFX9-NEXT: s_setpc_b64 s[30:31] entry: br i1 %cond, label %bb0, label %bb1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll index 489f46d1237a36..cb4cfaa9cabaf6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll @@ -524,24 +524,28 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1) ; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_cmp_ge_u64_e32 vcc_lo, 0, v[2:3] -; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX10-NEXT: s_cbranch_execz .LBB10_2 +; GFX10-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX10-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB10_2 ; GFX10-NEXT: ; %bb.1: ; %else ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, v2, v4, 0 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s1, v2, v5, v[1:2] ; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX10-NEXT: .LBB10_2: ; %Flow -; GFX10-NEXT: s_andn2_saveexec_b32 s0, s0 -; GFX10-NEXT: s_cbranch_execz .LBB10_4 +; GFX10-NEXT: s_xor_b32 s1, s0, exec_lo +; GFX10-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10-NEXT: s_cmov_b32 exec_lo, s0 +; GFX10-NEXT: s_cbranch_scc0 .LBB10_4 ; GFX10-NEXT: ; %bb.3: ; %if ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_lo_u32 v1, v2, v5 ; GFX10-NEXT: v_mov_b32_e32 v0, 
0 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX10-NEXT: .LBB10_4: ; %endif -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm @@ -558,12 +562,13 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b64 v[2:3], v0, s[6:7] ; GFX11-NEXT: global_load_b64 v[4:5], v0, s[0:1] -; GFX11-NEXT: s_mov_b32 s0, exec_lo ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cmpx_ge_u64_e32 0, v[2:3] -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB10_2 +; GFX11-NEXT: v_cmp_ge_u64_e32 vcc_lo, 0, v[2:3] +; GFX11-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX11-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX11-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB10_2 ; GFX11-NEXT: ; %bb.1: ; %else ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v4, 0 @@ -572,15 +577,19 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1) ; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: v_mov_b32_e32 v1, v3 ; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: .LBB10_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB10_4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s1, s0, exec_lo +; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_cmov_b32 exec_lo, s0 +; GFX11-NEXT: s_cbranch_scc0 .LBB10_4 ; GFX11-NEXT: ; %bb.3: ; %if ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mul_lo_u32 v1, v2, v5 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: .LBB10_4: ; %endif -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_nop 0 diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll index c7afbeabbbb6b1..6ae71660ee335a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll @@ -148,37 +148,43 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3 ; GCN-LABEL: func_non_entry_block_static_alloca_align4: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s7, s33 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GCN-NEXT: s_mov_b32 s9, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB2_3 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GCN-NEXT: s_mov_b64 s[4:5], exec +; GCN-NEXT: s_cmp_lg_u64 vcc, 0 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB2_4 ; GCN-NEXT: ; %bb.1: ; %bb.0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GCN-NEXT: s_and_b64 exec, exec, vcc -; GCN-NEXT: s_cbranch_execz .LBB2_3 +; GCN-NEXT: s_mov_b64 s[6:7], exec +; GCN-NEXT: s_cmp_lg_u64 vcc, 0 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB2_3 ; GCN-NEXT: ; %bb.2: ; %bb.1 -; GCN-NEXT: s_add_u32 s6, s32, 0x1000 +; GCN-NEXT: s_add_u32 s8, s32, 0x1000 ; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: v_mov_b32_e32 v3, s6 +; GCN-NEXT: v_mov_b32_e32 v3, s8 ; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; GCN-NEXT: v_mov_b32_e32 v2, 1 ; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:4 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v4 -; GCN-NEXT: v_add_u32_e32 v2, s6, v2 +; GCN-NEXT: v_add_u32_e32 v2, s8, v2 ; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen ; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v31 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_u32_e32 v2, v2, v3 ; GCN-NEXT: global_store_dword v[0:1], v2, off -; GCN-NEXT: .LBB2_3: ; %bb.2 +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; 
GCN-NEXT: .LBB2_3: ; %Flow ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: .LBB2_4: ; %bb.2 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: global_store_dword v[0:1], v0, off ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s7 +; GCN-NEXT: s_mov_b32 s33, s9 ; GCN-NEXT: s_setpc_b64 s[30:31] entry: @@ -213,11 +219,13 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s7, s33 ; GCN-NEXT: s_add_i32 s33, s32, 0xfc0 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GCN-NEXT: s_and_b32 s33, s33, 0xfffff000 ; GCN-NEXT: s_addk_i32 s32, 0x2000 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB3_2 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GCN-NEXT: s_mov_b64 s[4:5], exec +; GCN-NEXT: s_cmp_lg_u64 vcc, 0 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB3_2 ; GCN-NEXT: ; %bb.1: ; %bb.0 ; GCN-NEXT: s_add_u32 s6, s32, 0x1000 ; GCN-NEXT: s_and_b32 s6, s6, 0xfffff000 @@ -233,8 +241,8 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_u32_e32 v2, v2, v3 ; GCN-NEXT: global_store_dword v[0:1], v2, off -; GCN-NEXT: .LBB3_2: ; %bb.1 ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: .LBB3_2: ; %bb.1 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: global_store_dword v[0:1], v0, off ; GCN-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir index 0ef5aaea3b1497..f48fe5c4f414b7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir @@ -86,14 +86,12 @@ body: | ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: 
[[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY2]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: .3: ; CHECK-NEXT: successors: %bb.4(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: .4: %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %1:_(s32) = COPY $sgpr0 @@ -142,14 +140,12 @@ body: | ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY1]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: .3: ; CHECK-NEXT: successors: %bb.4(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: .4: %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %1:_(s32) = COPY $vgpr4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.else.32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.else.32.mir index a0711e6c779cd9..30a589fe571929 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.else.32.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.else.32.mir @@ -15,8 +15,8 @@ body: | ; CHECK: liveins: $sgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; CHECK-NEXT: [[INT:%[0-9]+]]:vcc(s1), [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.else), [[COPY]](s32) + ; CHECK-NEXT: [[INT:%[0-9]+]]:vcc(s1), [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.else), [[COPY]](s32) %0:_(s32) = COPY $sgpr0 - %1:_(s1), %2:_(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.else), %0 + %1:_(s1), %2:_(s32) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.else), %0 ... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.else.64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.else.64.mir index 8a2cbd0eafd6cd..0b62155da49671 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.else.64.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.else.64.mir @@ -12,8 +12,8 @@ body: | ; CHECK: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[INT:%[0-9]+]]:vcc(s1), [[INT1:%[0-9]+]]:sgpr(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.else), [[COPY]](s64) + ; CHECK-NEXT: [[INT:%[0-9]+]]:vcc(s1), [[INT1:%[0-9]+]]:sgpr(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.else), [[COPY]](s64) %0:_(s64) = COPY $sgpr0_sgpr1 - %1:_(s1), %2:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.else), %0 + %1:_(s1), %2:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.else), %0 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll index 61263e0efa2ea1..586142193618ff 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll @@ -147,14 +147,12 @@ define amdgpu_ps void @load_1d_vgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 %s) { ; FAST-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; FAST-NEXT: {{ $}} ; FAST-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY8]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8) - ; FAST-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; FAST-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; FAST-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; FAST-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; FAST-NEXT: {{ $}} ; FAST-NEXT: bb.4: ; FAST-NEXT: successors: %bb.5(0x80000000) ; FAST-NEXT: {{ $}} - ; FAST-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; FAST-NEXT: {{ $}} ; FAST-NEXT: bb.5: ; FAST-NEXT: [[COPY9:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) ; FAST-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[COPY9]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) undef`, addrspace 1) @@ -209,14 +207,12 @@ define amdgpu_ps void @load_1d_vgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 %s) { ; GREEDY-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY8]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8) - ; 
GREEDY-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GREEDY-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GREEDY-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GREEDY-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.4: ; GREEDY-NEXT: successors: %bb.5(0x80000000) ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.5: ; GREEDY-NEXT: [[COPY9:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) ; GREEDY-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[COPY9]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) undef`, addrspace 1) @@ -278,14 +274,12 @@ define amdgpu_ps void @load_1d_sgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 inreg ; FAST-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; FAST-NEXT: {{ $}} ; FAST-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8) - ; FAST-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; FAST-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; FAST-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; FAST-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; FAST-NEXT: {{ $}} ; FAST-NEXT: bb.4: ; FAST-NEXT: successors: %bb.5(0x80000000) ; FAST-NEXT: {{ $}} - ; FAST-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; FAST-NEXT: {{ $}} ; FAST-NEXT: bb.5: ; FAST-NEXT: [[COPY10:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) ; FAST-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[COPY10]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) undef`, addrspace 1) @@ -341,14 +335,12 @@ 
define amdgpu_ps void @load_1d_sgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 inreg ; GREEDY-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8) - ; GREEDY-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GREEDY-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GREEDY-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GREEDY-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.4: ; GREEDY-NEXT: successors: %bb.5(0x80000000) ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.5: ; GREEDY-NEXT: [[COPY10:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) ; GREEDY-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[COPY10]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) undef`, addrspace 1) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll index d6a7ae8d867fe8..811c51dd91c649 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll @@ -168,14 +168,12 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__sgpr_samp(<8 x i32> %rsr ; FAST-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; FAST-NEXT: {{ $}} ; FAST-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: 
(dereferenceable load (<4 x s32>), addrspace 8) - ; FAST-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; FAST-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; FAST-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; FAST-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; FAST-NEXT: {{ $}} ; FAST-NEXT: bb.4: ; FAST-NEXT: successors: %bb.5(0x80000000) ; FAST-NEXT: {{ $}} - ; FAST-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; FAST-NEXT: {{ $}} ; FAST-NEXT: bb.5: ; FAST-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) undef`, addrspace 1) ; FAST-NEXT: S_ENDPGM 0 @@ -234,14 +232,12 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__sgpr_samp(<8 x i32> %rsr ; GREEDY-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8) - ; GREEDY-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GREEDY-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GREEDY-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GREEDY-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.4: ; GREEDY-NEXT: successors: %bb.5(0x80000000) ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.5: ; GREEDY-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) undef`, addrspace 1) ; GREEDY-NEXT: S_ENDPGM 0 @@ 
-298,14 +294,12 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__sgpr_rsrc__vgpr_samp(<8 x i32> inre ; FAST-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; FAST-NEXT: {{ $}} ; FAST-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR2]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8) - ; FAST-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; FAST-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; FAST-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; FAST-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; FAST-NEXT: {{ $}} ; FAST-NEXT: bb.4: ; FAST-NEXT: successors: %bb.5(0x80000000) ; FAST-NEXT: {{ $}} - ; FAST-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; FAST-NEXT: {{ $}} ; FAST-NEXT: bb.5: ; FAST-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) undef`, addrspace 1) ; FAST-NEXT: S_ENDPGM 0 @@ -356,14 +350,12 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__sgpr_rsrc__vgpr_samp(<8 x i32> inre ; GREEDY-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR2]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8) - ; GREEDY-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GREEDY-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GREEDY-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GREEDY-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], 
[[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.4: ; GREEDY-NEXT: successors: %bb.5(0x80000000) ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.5: ; GREEDY-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) undef`, addrspace 1) ; GREEDY-NEXT: S_ENDPGM 0 @@ -440,14 +432,12 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__vgpr_samp(<8 x i32> %rsr ; FAST-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; FAST-NEXT: {{ $}} ; FAST-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR3]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8) - ; FAST-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; FAST-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; FAST-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; FAST-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; FAST-NEXT: {{ $}} ; FAST-NEXT: bb.4: ; FAST-NEXT: successors: %bb.5(0x80000000) ; FAST-NEXT: {{ $}} - ; FAST-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; FAST-NEXT: {{ $}} ; FAST-NEXT: bb.5: ; FAST-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) undef`, addrspace 1) ; FAST-NEXT: S_ENDPGM 0 @@ -518,14 +508,12 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__vgpr_samp(<8 x i32> %rsr ; GREEDY-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), 
[[BUILD_VECTOR3]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8) - ; GREEDY-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GREEDY-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GREEDY-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GREEDY-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.4: ; GREEDY-NEXT: successors: %bb.5(0x80000000) ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.5: ; GREEDY-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) undef`, addrspace 1) ; GREEDY-NEXT: S_ENDPGM 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll index 8c7bdb867d1681..1721aa5d9fc0d9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll @@ -88,14 +88,12 @@ define amdgpu_ps float @raw_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ 
$}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -135,14 +133,12 @@ define amdgpu_ps float @raw_buffer_load__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[V_READFIRSTLANE_B32_]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -194,14 +190,12 @@ define amdgpu_ps float @raw_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = 
S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.ptr.buffer.load.ll index 19793f7020dc84..16b674073fe9cb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.ptr.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.ptr.buffer.load.ll @@ -88,14 +88,12 @@ define amdgpu_ps float @raw_ptr_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -135,14 +133,12 @@ define amdgpu_ps float 
@raw_ptr_buffer_load__sgpr_rsrc__vgpr_val__vgpr_voffset__ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[V_READFIRSTLANE_B32_]], 0, 0, 0 :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -194,14 +190,12 @@ define amdgpu_ps float @raw_ptr_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0 :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term 
[[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll index f426fb8954ed26..feb2cdc0a079c8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll @@ -1640,11 +1640,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg % ; GFX7-NEXT: successors: %bb.4, %bb.2 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY5]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s32)) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) @@ -1686,11 +1685,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg % ; GFX12-NEXT: successors: %bb.4, %bb.2 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY5]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s32)) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec 
= S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) @@ -1738,11 +1736,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> % ; GFX7-NEXT: successors: %bb.4, %bb.2 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4092, 0, 0 :: (dereferenceable invariant load (s32)) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) @@ -1785,11 +1782,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> % ; GFX12-NEXT: successors: %bb.4, %bb.2 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4092, 0, 0 :: (dereferenceable invariant load (s32)) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, 
implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) @@ -1839,11 +1835,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> % ; GFX7-NEXT: successors: %bb.4, %bb.2 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s32)) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) @@ -1886,11 +1881,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> % ; GFX12-NEXT: successors: %bb.4, %bb.2 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4096, 0, 0 :: (dereferenceable invariant load (s32)) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; 
GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) @@ -1938,11 +1932,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) ; GFX7-NEXT: successors: %bb.4, %bb.2 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4095, 0, 0 :: (dereferenceable invariant load (s32) from unknown-address + 4095, align 1) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) @@ -1984,11 +1977,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) ; GFX12-NEXT: successors: %bb.4, %bb.2 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4095, 0, 0 :: (dereferenceable invariant load (s32) from unknown-address + 4095, align 1) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) @@ -2035,11 +2027,10 @@ define 
amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) ; GFX7-NEXT: successors: %bb.4, %bb.2 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s32)) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) @@ -2081,11 +2072,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) ; GFX12-NEXT: successors: %bb.4, %bb.2 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4096, 0, 0 :: (dereferenceable invariant load (s32) from unknown-address + 4096) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) @@ -2135,11 +2125,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> % ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: 
[[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4064, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4080, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) @@ -2192,11 +2181,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> % ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4064, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4080, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; 
GFX12-NEXT: bb.4: - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) @@ -2257,11 +2245,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> % ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) @@ -2314,11 +2301,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> % ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4068, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4084, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; 
GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) @@ -2377,11 +2363,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> % ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) @@ -2434,11 +2419,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> % ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: 
[[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4096, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4112, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) @@ -2496,11 +2480,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX7-NEXT: {{ $}} ; 
GFX7-NEXT: bb.4: - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) @@ -2554,11 +2537,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 5000, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 5016, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) @@ -2616,11 +2598,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 
4) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) @@ -2674,11 +2655,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4076, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4092, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) @@ -2736,11 +2716,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: 
[[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) @@ -2794,11 +2773,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4080, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4096, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: 
bb.4: - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) @@ -2855,11 +2833,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4064, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4080, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) @@ -2911,11 +2888,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4064, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4080, 0, 0 
:: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll index f664e62761ad59..90aefb274ab51f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll @@ -86,14 +86,12 @@ define amdgpu_ps float @struct_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]], [[COPY6]], 0, 0, -1 :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ 
$}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -132,14 +130,12 @@ define amdgpu_ps float @struct_buffer_load__sgpr_rsrc__vgpr_val__vgpr_vindex_vgp ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]], [[V_READFIRSTLANE_B32_]], 0, 0, -1 :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -190,14 +186,12 @@ define amdgpu_ps float @struct_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]], [[V_READFIRSTLANE_B32_4]], 0, 0, -1 :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: 
SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll index 0f72586ed6c127..4ecb84c3bf3085 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll @@ -88,14 +88,12 @@ define amdgpu_ps void @struct_buffer_store__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: G_AMDGPU_BUFFER_STORE [[COPY4]](s32), [[BUILD_VECTOR1]](<4 x s32>), [[COPY5]](s32), [[COPY6]], [[COPY7]], 0, 0, -1 :: (dereferenceable store (s32), align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -134,14 +132,12 @@ define amdgpu_ps void @struct_buffer_store__sgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK-NEXT: 
successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: G_AMDGPU_BUFFER_STORE [[COPY4]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY5]](s32), [[COPY6]], [[V_READFIRSTLANE_B32_]], 0, 0, -1 :: (dereferenceable store (s32), align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -192,14 +188,12 @@ define amdgpu_ps void @struct_buffer_store__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: G_AMDGPU_BUFFER_STORE [[COPY4]](s32), [[BUILD_VECTOR1]](<4 x s32>), [[COPY5]](s32), [[COPY6]], [[V_READFIRSTLANE_B32_4]], 0, 0, -1 :: (dereferenceable store (s32), align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void 
@llvm.amdgcn.struct.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.ptr.buffer.load.ll index b835b3a3e380b8..cdc664f624653a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.ptr.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.ptr.buffer.load.ll @@ -86,14 +86,12 @@ define amdgpu_ps float @struct_ptr_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]], [[COPY6]], 0, 0, -1 :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -132,14 +130,12 @@ define amdgpu_ps float @struct_ptr_buffer_load__sgpr_rsrc__vgpr_val__vgpr_vindex ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]], [[V_READFIRSTLANE_B32_]], 0, 0, -1 :: (dereferenceable load (s32) from 
%ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -190,14 +186,12 @@ define amdgpu_ps float @struct_ptr_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]], [[V_READFIRSTLANE_B32_4]], 0, 0, -1 :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.ptr.buffer.store.ll 
index 0cefc373dd7cd8..61adc934bba942 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.ptr.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.ptr.buffer.store.ll @@ -88,14 +88,12 @@ define amdgpu_ps void @struct_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_vindex ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: G_AMDGPU_BUFFER_STORE [[COPY4]](s32), [[BUILD_VECTOR1]](<4 x s32>), [[COPY5]](s32), [[COPY6]], [[COPY7]], 0, 0, -1 :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -134,14 +132,12 @@ define amdgpu_ps void @struct_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_vindex ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: G_AMDGPU_BUFFER_STORE [[COPY4]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY5]](s32), [[COPY6]], [[V_READFIRSTLANE_B32_]], 0, 0, -1 :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], 
implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -192,14 +188,12 @@ define amdgpu_ps void @struct_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_vindex ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: G_AMDGPU_BUFFER_STORE [[COPY4]](s32), [[BUILD_VECTOR1]](<4 x s32>), [[COPY5]](s32), [[COPY6]], [[V_READFIRSTLANE_B32_4]], 0, 0, -1 :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-waterfall-agpr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-waterfall-agpr.mir index 23383f27efce71..b85e9d49779a47 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-waterfall-agpr.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-waterfall-agpr.mir @@ -40,14 +40,12 @@ body: | ; 
CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: G_AMDGPU_BUFFER_STORE %val(s32), %rsrc(<4 x s32>), [[COPY]](s32), %voffset, [[V_READFIRSTLANE_B32_]], 0, 0, 0 :: (dereferenceable store (s32), addrspace 4) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: .3: ; CHECK-NEXT: successors: %bb.4(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: .4: ; CHECK-NEXT: S_ENDPGM 0 %val:_(s32) = COPY $vgpr0 @@ -108,14 +106,12 @@ body: | ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY1]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<4 x s32>)) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: .3: ; CHECK-NEXT: successors: %bb.4(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: .4: ; CHECK-NEXT: S_ENDPGM 0, implicit [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) %0:_(<8 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index 404e726246f4d2..9eb4d4f2f90a20 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -14,16 +14,11 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB0_3 -; CHECK-NEXT: ; %bb.1: ; %Flow -; CHECK-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; CHECK-NEXT: s_cbranch_execnz .LBB0_4 -; CHECK-NEXT: .LBB0_2: -; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] -; CHECK-NEXT: s_setpc_b64 s[30:31] -; CHECK-NEXT: .LBB0_3: +; CHECK-NEXT: s_xor_b64 s[6:7], vcc, exec +; CHECK-NEXT: s_and_b64 s[4:5], vcc, -1 +; CHECK-NEXT: s_cmov_b64 exec, vcc +; CHECK-NEXT: s_cbranch_scc0 .LBB0_2 +; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v0 ; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v3, v0, vcc @@ -159,9 +154,13 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; CHECK-NEXT: ; implicit-def: $vgpr2 ; CHECK-NEXT: ; implicit-def: $vgpr4 -; CHECK-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; CHECK-NEXT: s_cbranch_execz .LBB0_2 -; CHECK-NEXT: .LBB0_4: +; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: .LBB0_2: ; %Flow +; CHECK-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; CHECK-NEXT: s_and_b64 s[4:5], s[6:7], -1 +; CHECK-NEXT: s_cmov_b64 exec, s[6:7] +; CHECK-NEXT: s_cbranch_scc0 .LBB0_4 +; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v2 ; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -182,7 +181,8 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: s_or_b64 
exec, exec, s[6:7] +; CHECK-NEXT: s_or_b64 exec, exec, s[8:9] +; CHECK-NEXT: .LBB0_4: ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = sdiv i64 %num, %den ret i64 %result @@ -654,11 +654,12 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_mov_b32_e32 v0, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; CGP-NEXT: v_mov_b32_e32 v8, v2 +; CGP-NEXT: s_xor_b64 s[6:7], vcc, exec ; CGP-NEXT: v_mov_b32_e32 v9, v3 +; CGP-NEXT: s_and_b64 s[4:5], vcc, -1 ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CGP-NEXT: s_cbranch_execz .LBB2_2 +; CGP-NEXT: s_cmov_b64 exec, vcc +; CGP-NEXT: s_cbranch_scc0 .LBB2_2 ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v5 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v0 @@ -793,9 +794,12 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; CGP-NEXT: ; implicit-def: $vgpr4 ; CGP-NEXT: ; implicit-def: $vgpr10 +; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: .LBB2_2: ; %Flow1 -; CGP-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; CGP-NEXT: s_cbranch_execz .LBB2_4 +; CGP-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; CGP-NEXT: s_and_b64 s[4:5], s[6:7], -1 +; CGP-NEXT: s_cmov_b64 exec, s[6:7] +; CGP-NEXT: s_cbranch_scc0 .LBB2_4 ; CGP-NEXT: ; %bb.3: ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v4 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v4 @@ -817,22 +821,17 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CGP-NEXT: v_mov_b32_e32 v1, 0 +; CGP-NEXT: s_or_b64 exec, exec, s[8:9] ; CGP-NEXT: .LBB2_4: -; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: v_or_b32_e32 v3, v9, v7 ; CGP-NEXT: v_mov_b32_e32 v2, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CGP-NEXT: 
s_cbranch_execnz .LBB2_7 -; CGP-NEXT: ; %bb.5: ; %Flow -; CGP-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; CGP-NEXT: s_cbranch_execnz .LBB2_8 -; CGP-NEXT: .LBB2_6: -; CGP-NEXT: s_or_b64 exec, exec, s[6:7] -; CGP-NEXT: s_setpc_b64 s[30:31] -; CGP-NEXT: .LBB2_7: +; CGP-NEXT: s_xor_b64 s[6:7], vcc, exec +; CGP-NEXT: s_and_b64 s[4:5], vcc, -1 +; CGP-NEXT: s_cmov_b64 exec, vcc +; CGP-NEXT: s_cbranch_scc0 .LBB2_6 +; CGP-NEXT: ; %bb.5: ; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v7 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v6, v2 ; CGP-NEXT: v_addc_u32_e32 v5, vcc, v7, v2, vcc @@ -966,9 +965,13 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc ; CGP-NEXT: ; implicit-def: $vgpr6 ; CGP-NEXT: ; implicit-def: $vgpr8 -; CGP-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; CGP-NEXT: s_cbranch_execz .LBB2_6 -; CGP-NEXT: .LBB2_8: +; CGP-NEXT: s_or_b64 exec, exec, s[6:7] +; CGP-NEXT: .LBB2_6: ; %Flow +; CGP-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; CGP-NEXT: s_and_b64 s[4:5], s[6:7], -1 +; CGP-NEXT: s_cmov_b64 exec, s[6:7] +; CGP-NEXT: s_cbranch_scc0 .LBB2_8 +; CGP-NEXT: ; %bb.7: ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v6 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v6 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 @@ -989,7 +992,8 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: v_mov_b32_e32 v3, 0 -; CGP-NEXT: s_or_b64 exec, exec, s[6:7] +; CGP-NEXT: s_or_b64 exec, exec, s[8:9] +; CGP-NEXT: .LBB2_8: ; CGP-NEXT: s_setpc_b64 s[30:31] %result = sdiv <2 x i64> %num, %den ret <2 x i64> %result @@ -1661,16 +1665,11 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_or_b32_e32 v1, v4, v6 ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB7_3 -; CHECK-NEXT: ; 
%bb.1: ; %Flow -; CHECK-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; CHECK-NEXT: s_cbranch_execnz .LBB7_4 -; CHECK-NEXT: .LBB7_2: -; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] -; CHECK-NEXT: s_setpc_b64 s[30:31] -; CHECK-NEXT: .LBB7_3: +; CHECK-NEXT: s_xor_b64 s[6:7], vcc, exec +; CHECK-NEXT: s_and_b64 s[4:5], vcc, -1 +; CHECK-NEXT: s_cmov_b64 exec, vcc +; CHECK-NEXT: s_cbranch_scc0 .LBB7_2 +; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_ashrrev_i32_e32 v0, 31, v6 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v5, v0 ; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v6, v0, vcc @@ -1804,9 +1803,13 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6 ; CHECK-NEXT: ; implicit-def: $vgpr3 -; CHECK-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; CHECK-NEXT: s_cbranch_execz .LBB7_2 -; CHECK-NEXT: .LBB7_4: +; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: .LBB7_2: ; %Flow +; CHECK-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; CHECK-NEXT: s_and_b64 s[4:5], s[6:7], -1 +; CHECK-NEXT: s_cmov_b64 exec, s[6:7] +; CHECK-NEXT: s_cbranch_scc0 .LBB7_4 +; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v5 ; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v5 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -1827,7 +1830,8 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: s_or_b64 exec, exec, s[8:9] +; CHECK-NEXT: .LBB7_4: ; CHECK-NEXT: s_setpc_b64 s[30:31] %shl.y = shl i64 4096, %y %r = sdiv i64 %x, %shl.y @@ -2113,23 +2117,24 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_mov_b32_e32 v5, v2 -; CGP-NEXT: v_mov_b32_e32 v7, v3 +; CGP-NEXT: v_mov_b32_e32 v9, v3 ; CGP-NEXT: v_mov_b32_e32 v2, 0x1000 ; CGP-NEXT: v_mov_b32_e32 v3, 
0 -; CGP-NEXT: v_lshl_b64 v[11:12], v[2:3], v4 -; CGP-NEXT: v_mov_b32_e32 v9, v1 -; CGP-NEXT: v_mov_b32_e32 v8, v0 -; CGP-NEXT: v_or_b32_e32 v1, v9, v12 +; CGP-NEXT: v_lshl_b64 v[12:13], v[2:3], v4 +; CGP-NEXT: v_mov_b32_e32 v8, v1 +; CGP-NEXT: v_mov_b32_e32 v7, v0 +; CGP-NEXT: v_or_b32_e32 v1, v8, v13 ; CGP-NEXT: v_mov_b32_e32 v0, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CGP-NEXT: s_cbranch_execz .LBB8_2 +; CGP-NEXT: s_xor_b64 s[6:7], vcc, exec +; CGP-NEXT: s_and_b64 s[4:5], vcc, -1 +; CGP-NEXT: s_cmov_b64 exec, vcc +; CGP-NEXT: s_cbranch_scc0 .LBB8_2 ; CGP-NEXT: ; %bb.1: -; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v12 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v11, v0 -; CGP-NEXT: v_addc_u32_e32 v10, vcc, v12, v0, vcc +; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v13 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v12, v0 +; CGP-NEXT: v_addc_u32_e32 v10, vcc, v13, v0, vcc ; CGP-NEXT: v_xor_b32_e32 v4, v1, v0 ; CGP-NEXT: v_xor_b32_e32 v1, v10, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v10, v4 @@ -2172,276 +2177,275 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v11, vcc ; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 ; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[11:12] -; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v14 +; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v8 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14 ; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] -; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v14, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v8, v14 -; CGP-NEXT: v_mul_lo_u32 v8, v16, v10 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v14, vcc +; CGP-NEXT: v_xor_b32_e32 v12, v7, v14 +; CGP-NEXT: v_mul_lo_u32 v7, v16, v10 ; CGP-NEXT: v_mul_lo_u32 v15, v13, v11 -; CGP-NEXT: v_xor_b32_e32 v17, v9, v14 -; CGP-NEXT: v_mul_hi_u32 v9, v13, v10 +; 
CGP-NEXT: v_xor_b32_e32 v17, v8, v14 +; CGP-NEXT: v_mul_hi_u32 v8, v13, v10 ; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v15 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v16, v11 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v15, v8 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v8, v16, v11 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v15, v7 ; CGP-NEXT: v_mul_hi_u32 v15, v13, v11 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v15 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v15 ; CGP-NEXT: v_mul_hi_u32 v11, v16, v11 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, v16, v9, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v17, v8 -; CGP-NEXT: v_mul_lo_u32 v11, v12, v9 -; CGP-NEXT: v_mul_hi_u32 v13, v12, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v17, v8 -; CGP-NEXT: v_mul_hi_u32 v15, v17, v9 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v13, v7 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v16, v8, vcc +; CGP-NEXT: v_mul_lo_u32 v10, v17, v7 +; CGP-NEXT: v_mul_lo_u32 v11, v12, v8 +; CGP-NEXT: v_mul_hi_u32 v13, v12, v7 +; CGP-NEXT: v_mul_hi_u32 v7, v17, v7 +; CGP-NEXT: v_mul_hi_u32 v15, v17, v8 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; 
CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v17, v9 +; CGP-NEXT: v_mul_lo_u32 v13, v17, v8 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v11, v12, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8 +; CGP-NEXT: v_mul_hi_u32 v11, v12, v8 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v13, v7 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v8, v10 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v13, 0 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v7, v10 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v13, 0 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v15, v10 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v11, v[9:10] -; CGP-NEXT: v_sub_i32_e32 v8, vcc, v12, v8 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v1, v13, v[9:10] -; CGP-NEXT: v_subb_u32_e64 v10, s[4:5], v17, v9, vcc -; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v17, v9 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1 -; CGP-NEXT: v_subb_u32_e32 v9, vcc, v9, v1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v10 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v4, v15, v[8:9] +; CGP-NEXT: v_sub_i32_e32 v7, vcc, v12, v7 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v1, v13, v[10:11] +; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v17, v10, vcc +; CGP-NEXT: v_sub_i32_e64 v10, s[4:5], v17, v10 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1 +; CGP-NEXT: v_subb_u32_e32 v10, vcc, v10, v1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v4 +; CGP-NEXT: v_sub_i32_e32 v7, vcc, v7, v4 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v4 -; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v4 -; CGP-NEXT: 
v_cndmask_b32_e64 v15, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v1 -; CGP-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc -; CGP-NEXT: v_cndmask_b32_e64 v10, v12, v15, s[4:5] -; CGP-NEXT: v_add_i32_e32 v12, vcc, 1, v13 -; CGP-NEXT: v_addc_u32_e32 v15, vcc, 0, v11, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v9, v1 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v1 +; CGP-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v10, vcc +; CGP-NEXT: v_cndmask_b32_e64 v8, v11, v12, s[4:5] +; CGP-NEXT: v_add_i32_e32 v11, vcc, 1, v13 +; CGP-NEXT: v_addc_u32_e32 v12, vcc, 0, v15, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v10, v1 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, -1, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v9, v1 +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v10, v1 ; CGP-NEXT: v_cndmask_b32_e32 v1, v16, v4, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v12 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v15, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v11 +; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v12, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CGP-NEXT: v_cndmask_b32_e32 v1, v12, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v4, v15, v8, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; CGP-NEXT: v_cndmask_b32_e32 v1, v11, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v12, v7, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v8, v14, v0 -; CGP-NEXT: v_cndmask_b32_e32 v4, v11, v4, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v1, v8 -; CGP-NEXT: v_xor_b32_e32 v1, v4, v8 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc -; CGP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; CGP-NEXT: ; implicit-def: $vgpr8 +; CGP-NEXT: v_xor_b32_e32 v7, v14, v0 +; CGP-NEXT: v_cndmask_b32_e32 v4, v15, v4, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v1, v7 +; CGP-NEXT: v_xor_b32_e32 v1, v4, v7 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, 
v0, v7 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v7, vcc +; CGP-NEXT: ; implicit-def: $vgpr12_vgpr13 +; CGP-NEXT: ; implicit-def: $vgpr7 +; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: .LBB8_2: ; %Flow1 -; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] -; CGP-NEXT: v_lshl_b64 v[9:10], v[2:3], v6 -; CGP-NEXT: s_xor_b64 exec, exec, s[6:7] -; CGP-NEXT: s_cbranch_execz .LBB8_4 +; CGP-NEXT: v_lshl_b64 v[10:11], v[2:3], v6 +; CGP-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; CGP-NEXT: s_and_b64 s[4:5], s[6:7], -1 +; CGP-NEXT: s_cmov_b64 exec, s[6:7] +; CGP-NEXT: s_cbranch_scc0 .LBB8_4 ; CGP-NEXT: ; %bb.3: -; CGP-NEXT: v_cvt_f32_u32_e32 v0, v11 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v11 +; CGP-NEXT: v_cvt_f32_u32_e32 v0, v12 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v12 ; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0 ; CGP-NEXT: v_mul_lo_u32 v1, v1, v0 ; CGP-NEXT: v_mul_hi_u32 v1, v0, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 -; CGP-NEXT: v_mul_lo_u32 v1, v0, v11 +; CGP-NEXT: v_mul_hi_u32 v0, v7, v0 +; CGP-NEXT: v_mul_lo_u32 v1, v0, v12 ; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v8, v1 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v11 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v7, v1 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v12 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CGP-NEXT: v_sub_i32_e64 v2, s[4:5], v1, v11 +; CGP-NEXT: v_sub_i32_e64 v2, s[4:5], v1, v12 ; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v11 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v12 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CGP-NEXT: v_mov_b32_e32 v1, 0 +; CGP-NEXT: s_or_b64 exec, exec, s[8:9] ; CGP-NEXT: .LBB8_4: -; CGP-NEXT: s_or_b64 exec, exec, s[6:7] -; CGP-NEXT: v_or_b32_e32 v3, v7, v10 +; CGP-NEXT: v_or_b32_e32 v3, v9, v11 ; CGP-NEXT: v_mov_b32_e32 v2, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 
0, v[2:3] ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CGP-NEXT: s_cbranch_execnz .LBB8_7 -; CGP-NEXT: ; %bb.5: ; %Flow -; CGP-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; CGP-NEXT: s_cbranch_execnz .LBB8_8 -; CGP-NEXT: .LBB8_6: -; CGP-NEXT: s_or_b64 exec, exec, s[6:7] -; CGP-NEXT: s_setpc_b64 s[30:31] -; CGP-NEXT: .LBB8_7: -; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v10 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v2 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v10, v2, vcc +; CGP-NEXT: s_xor_b64 s[6:7], vcc, exec +; CGP-NEXT: s_and_b64 s[4:5], vcc, -1 +; CGP-NEXT: s_cmov_b64 exec, vcc +; CGP-NEXT: s_cbranch_scc0 .LBB8_6 +; CGP-NEXT: ; %bb.5: +; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v11 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v2 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v11, v2, vcc ; CGP-NEXT: v_xor_b32_e32 v4, v3, v2 ; CGP-NEXT: v_xor_b32_e32 v3, v6, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v4 -; CGP-NEXT: v_cvt_f32_u32_e32 v8, v3 -; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v4 -; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v3, vcc -; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v8 +; CGP-NEXT: v_cvt_f32_u32_e32 v7, v3 +; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v4 +; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v3, vcc +; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 ; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; CGP-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 -; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v6 -; CGP-NEXT: v_trunc_f32_e32 v10, v8 -; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v10 -; CGP-NEXT: v_cvt_u32_f32_e32 v11, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v14, v10 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 -; CGP-NEXT: v_mov_b32_e32 v6, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v14, v[6:7] -; CGP-NEXT: v_mul_lo_u32 v6, v14, v8 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v10, v11, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v14, v8 -; CGP-NEXT: v_mul_lo_u32 v15, v11, v9 -; CGP-NEXT: 
v_mul_lo_u32 v16, v14, v9 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v15 +; CGP-NEXT: v_mul_f32_e32 v7, 0x2f800000, v6 +; CGP-NEXT: v_trunc_f32_e32 v8, v7 +; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v10, v6 +; CGP-NEXT: v_cvt_u32_f32_e32 v13, v8 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v10, 0 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[7:8] +; CGP-NEXT: v_mul_hi_u32 v14, v10, v6 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v10, v[7:8] +; CGP-NEXT: v_mul_lo_u32 v8, v13, v6 +; CGP-NEXT: v_mul_hi_u32 v6, v13, v6 +; CGP-NEXT: v_mul_lo_u32 v15, v10, v7 +; CGP-NEXT: v_mul_lo_u32 v16, v13, v7 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v9 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v15, v6 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v16, v8 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v14 +; CGP-NEXT: v_mul_hi_u32 v14, v10, v7 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v15, v8 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v16, v6 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v15, v10 -; CGP-NEXT: v_mul_hi_u32 v9, v14, v9 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v14 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; CGP-NEXT: v_mul_hi_u32 v7, v13, v7 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v6 -; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v8, vcc -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 -; CGP-NEXT: v_mov_b32_e32 v6, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], 
s[4:5], v12, v14, v[6:7] -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v7 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v14, v8 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v6 +; CGP-NEXT: v_addc_u32_e32 v13, vcc, v13, v7, vcc +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v10, 0 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[7:8] +; CGP-NEXT: v_ashrrev_i32_e32 v11, 31, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v10, v[7:8] +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v11, vcc +; CGP-NEXT: v_xor_b32_e32 v9, v5, v11 +; CGP-NEXT: v_mul_lo_u32 v5, v13, v6 +; CGP-NEXT: v_mul_lo_u32 v12, v10, v7 +; CGP-NEXT: v_mul_hi_u32 v14, v10, v6 +; CGP-NEXT: v_mul_hi_u32 v6, v13, v6 +; CGP-NEXT: v_xor_b32_e32 v8, v8, v11 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v7, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v10, v5, v12 -; CGP-NEXT: v_mul_lo_u32 v5, v14, v8 -; CGP-NEXT: v_mul_lo_u32 v7, v11, v9 -; CGP-NEXT: v_xor_b32_e32 v13, v6, v12 -; CGP-NEXT: v_mul_hi_u32 v6, v11, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v14, v8 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v14, v9 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_mul_hi_u32 v7, v11, v9 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_mul_hi_u32 v8, v14, v9 +; CGP-NEXT: v_mul_lo_u32 v14, v13, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; CGP-NEXT: v_mul_hi_u32 v12, v10, v7 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v14, v6 +; 
CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 +; CGP-NEXT: v_mul_hi_u32 v7, v13, v7 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v12, v6 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v14, v6, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v13, v5 -; CGP-NEXT: v_mul_lo_u32 v8, v10, v6 -; CGP-NEXT: v_mul_hi_u32 v9, v10, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v13, v5 -; CGP-NEXT: v_mul_hi_u32 v11, v13, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v13, v6, vcc +; CGP-NEXT: v_mul_lo_u32 v7, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v10, v9, v6 +; CGP-NEXT: v_mul_hi_u32 v12, v9, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v13, v8, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v13, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_mul_hi_u32 v8, v10, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v5, v7 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, 0 +; CGP-NEXT: v_mul_lo_u32 v12, v8, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; CGP-NEXT: v_mul_hi_u32 v10, v9, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; 
CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v5, v7 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v12, 0 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v7 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v8, v[6:7] -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v9, v[6:7] -; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v13, v6, vcc -; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v13, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v7 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v10, v[6:7] +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v9, v5 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v12, v[6:7] +; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v8, v6, vcc +; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v8, v6 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3 ; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v4 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v4 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3 ; CGP-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc -; CGP-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[4:5] -; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v9 -; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc +; CGP-NEXT: v_cndmask_b32_e64 v7, v8, v9, s[4:5] +; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v12 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 ; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v10 -; CGP-NEXT: 
v_addc_u32_e32 v5, vcc, 0, v11, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v8 +; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v4, v11, v5, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v5, v12, v2 -; CGP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc +; CGP-NEXT: v_xor_b32_e32 v5, v11, v2 +; CGP-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc ; CGP-NEXT: v_xor_b32_e32 v2, v3, v5 ; CGP-NEXT: v_xor_b32_e32 v3, v4, v5 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 ; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc -; CGP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; CGP-NEXT: ; implicit-def: $vgpr10_vgpr11 ; CGP-NEXT: ; implicit-def: $vgpr5 -; CGP-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; CGP-NEXT: s_cbranch_execz .LBB8_6 -; CGP-NEXT: .LBB8_8: -; CGP-NEXT: v_cvt_f32_u32_e32 v2, v9 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v9 +; CGP-NEXT: s_or_b64 exec, exec, s[6:7] +; CGP-NEXT: .LBB8_6: ; %Flow +; CGP-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; CGP-NEXT: s_and_b64 s[4:5], s[6:7], -1 +; CGP-NEXT: s_cmov_b64 exec, s[6:7] +; CGP-NEXT: s_cbranch_scc0 .LBB8_8 +; CGP-NEXT: ; %bb.7: +; CGP-NEXT: v_cvt_f32_u32_e32 v2, v10 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v10 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 @@ -2449,18 +2453,19 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_mul_hi_u32 v2, v5, v2 -; CGP-NEXT: v_mul_lo_u32 v3, v2, v9 +; CGP-NEXT: v_mul_lo_u32 v3, v2, v10 ; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, v5, v3 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, 
v3, v9 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v10 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v9 +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v10 ; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v9 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v10 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: v_mov_b32_e32 v3, 0 -; CGP-NEXT: s_or_b64 exec, exec, s[6:7] +; CGP-NEXT: s_or_b64 exec, exec, s[8:9] +; CGP-NEXT: .LBB8_8: ; CGP-NEXT: s_setpc_b64 s[30:31] %shl.y = shl <2 x i64> , %y %r = sdiv <2 x i64> %x, %shl.y diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index 5b94e71ecf52e2..3e3775bb74bc52 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -14,16 +14,11 @@ define i64 @v_srem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB0_3 -; CHECK-NEXT: ; %bb.1: ; %Flow -; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; CHECK-NEXT: s_cbranch_execnz .LBB0_4 -; CHECK-NEXT: .LBB0_2: -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_setpc_b64 s[30:31] -; CHECK-NEXT: .LBB0_3: +; CHECK-NEXT: s_xor_b64 s[6:7], vcc, exec +; CHECK-NEXT: s_and_b64 s[4:5], vcc, -1 +; CHECK-NEXT: s_cmov_b64 exec, vcc +; CHECK-NEXT: s_cbranch_scc0 .LBB0_2 +; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v3 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v2, v1 ; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v3, v1, vcc @@ -155,9 +150,13 @@ define i64 @v_srem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; CHECK-NEXT: ; implicit-def: $vgpr2 ; CHECK-NEXT: ; implicit-def: $vgpr4 -; CHECK-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[6:7] -; CHECK-NEXT: s_cbranch_execz .LBB0_2 -; CHECK-NEXT: .LBB0_4: +; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: .LBB0_2: ; %Flow +; CHECK-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; CHECK-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; CHECK-NEXT: s_cmov_b64 exec, s[6:7] +; CHECK-NEXT: s_cbranch_scc0 .LBB0_4 +; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v2 ; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -177,6 +176,7 @@ define i64 @v_srem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: .LBB0_4: ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = srem i64 %num, %den ret i64 %result @@ -640,11 +640,12 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_mov_b32_e32 v0, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; CGP-NEXT: v_mov_b32_e32 v8, v2 +; CGP-NEXT: s_xor_b64 s[6:7], vcc, exec ; CGP-NEXT: v_mov_b32_e32 v9, v3 +; CGP-NEXT: s_and_b64 s[4:5], vcc, -1 ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CGP-NEXT: s_cbranch_execz .LBB2_2 +; CGP-NEXT: s_cmov_b64 exec, vcc +; CGP-NEXT: s_cbranch_scc0 .LBB2_2 ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v1 @@ -777,9 +778,12 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc ; CGP-NEXT: ; implicit-def: $vgpr4 ; CGP-NEXT: ; implicit-def: $vgpr10 +; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: .LBB2_2: ; %Flow1 -; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; CGP-NEXT: s_cbranch_execz .LBB2_4 +; CGP-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; CGP-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; CGP-NEXT: s_cmov_b64 exec, s[6:7] +; CGP-NEXT: s_cbranch_scc0 .LBB2_4 ; CGP-NEXT: ; %bb.3: ; CGP-NEXT: 
v_cvt_f32_u32_e32 v0, v4 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v4 @@ -799,22 +803,17 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; CGP-NEXT: v_mov_b32_e32 v1, 0 -; CGP-NEXT: .LBB2_4: ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] +; CGP-NEXT: .LBB2_4: ; CGP-NEXT: v_or_b32_e32 v3, v9, v7 ; CGP-NEXT: v_mov_b32_e32 v2, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CGP-NEXT: s_cbranch_execnz .LBB2_7 -; CGP-NEXT: ; %bb.5: ; %Flow -; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; CGP-NEXT: s_cbranch_execnz .LBB2_8 -; CGP-NEXT: .LBB2_6: -; CGP-NEXT: s_or_b64 exec, exec, s[4:5] -; CGP-NEXT: s_setpc_b64 s[30:31] -; CGP-NEXT: .LBB2_7: +; CGP-NEXT: s_xor_b64 s[6:7], vcc, exec +; CGP-NEXT: s_and_b64 s[4:5], vcc, -1 +; CGP-NEXT: s_cmov_b64 exec, vcc +; CGP-NEXT: s_cbranch_scc0 .LBB2_6 +; CGP-NEXT: ; %bb.5: ; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v7 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v3 ; CGP-NEXT: v_addc_u32_e32 v4, vcc, v7, v3, vcc @@ -946,9 +945,13 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc ; CGP-NEXT: ; implicit-def: $vgpr6 ; CGP-NEXT: ; implicit-def: $vgpr8 -; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; CGP-NEXT: s_cbranch_execz .LBB2_6 -; CGP-NEXT: .LBB2_8: +; CGP-NEXT: s_or_b64 exec, exec, s[6:7] +; CGP-NEXT: .LBB2_6: ; %Flow +; CGP-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; CGP-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; CGP-NEXT: s_cmov_b64 exec, s[6:7] +; CGP-NEXT: s_cbranch_scc0 .LBB2_8 +; CGP-NEXT: ; %bb.7: ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v6 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v6 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 @@ -968,6 +971,7 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; CGP-NEXT: 
v_mov_b32_e32 v3, 0 ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] +; CGP-NEXT: .LBB2_8: ; CGP-NEXT: s_setpc_b64 s[30:31] %result = srem <2 x i64> %num, %den ret <2 x i64> %result @@ -2176,16 +2180,11 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_or_b32_e32 v1, v4, v6 ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB7_3 -; CHECK-NEXT: ; %bb.1: ; %Flow -; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; CHECK-NEXT: s_cbranch_execnz .LBB7_4 -; CHECK-NEXT: .LBB7_2: -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_setpc_b64 s[30:31] -; CHECK-NEXT: .LBB7_3: +; CHECK-NEXT: s_xor_b64 s[6:7], vcc, exec +; CHECK-NEXT: s_and_b64 s[4:5], vcc, -1 +; CHECK-NEXT: s_cmov_b64 exec, vcc +; CHECK-NEXT: s_cbranch_scc0 .LBB7_2 +; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v6 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v1 ; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v6, v1, vcc @@ -2319,9 +2318,13 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6 ; CHECK-NEXT: ; implicit-def: $vgpr3 -; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; CHECK-NEXT: s_cbranch_execz .LBB7_2 -; CHECK-NEXT: .LBB7_4: +; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: .LBB7_2: ; %Flow +; CHECK-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; CHECK-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; CHECK-NEXT: s_cmov_b64 exec, s[6:7] +; CHECK-NEXT: s_cbranch_scc0 .LBB7_4 +; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v5 ; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v5 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -2341,6 +2344,7 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; 
CHECK-NEXT: .LBB7_4: ; CHECK-NEXT: s_setpc_b64 s[30:31] %shl.y = shl i64 4096, %y %r = srem i64 %x, %shl.y @@ -2622,23 +2626,24 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_mov_b32_e32 v5, v2 -; CGP-NEXT: v_mov_b32_e32 v7, v3 +; CGP-NEXT: v_mov_b32_e32 v9, v3 ; CGP-NEXT: v_mov_b32_e32 v2, 0x1000 ; CGP-NEXT: v_mov_b32_e32 v3, 0 -; CGP-NEXT: v_lshl_b64 v[11:12], v[2:3], v4 -; CGP-NEXT: v_mov_b32_e32 v9, v1 -; CGP-NEXT: v_mov_b32_e32 v8, v0 -; CGP-NEXT: v_or_b32_e32 v1, v9, v12 +; CGP-NEXT: v_lshl_b64 v[12:13], v[2:3], v4 +; CGP-NEXT: v_mov_b32_e32 v8, v1 +; CGP-NEXT: v_mov_b32_e32 v7, v0 +; CGP-NEXT: v_or_b32_e32 v1, v8, v13 ; CGP-NEXT: v_mov_b32_e32 v0, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CGP-NEXT: s_cbranch_execz .LBB8_2 +; CGP-NEXT: s_xor_b64 s[6:7], vcc, exec +; CGP-NEXT: s_and_b64 s[4:5], vcc, -1 +; CGP-NEXT: s_cmov_b64 exec, vcc +; CGP-NEXT: s_cbranch_scc0 .LBB8_2 ; CGP-NEXT: ; %bb.1: -; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v12 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v11, v1 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, v12, v1, vcc +; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v13 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v12, v1 +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v13, v1, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v0, v1 ; CGP-NEXT: v_xor_b32_e32 v1, v4, v1 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v0 @@ -2683,78 +2688,78 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 ; CGP-NEXT: v_mov_b32_e32 v4, v11 ; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v14 +; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v8 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v14 ; CGP-NEXT: v_mad_u64_u32 
v[11:12], s[4:5], v15, v13, v[11:12] -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v14, vcc +; CGP-NEXT: v_addc_u32_e32 v7, vcc, v8, v14, vcc ; CGP-NEXT: v_xor_b32_e32 v12, v4, v14 ; CGP-NEXT: v_mul_lo_u32 v4, v16, v10 -; CGP-NEXT: v_mul_lo_u32 v9, v13, v11 -; CGP-NEXT: v_xor_b32_e32 v15, v8, v14 -; CGP-NEXT: v_mul_hi_u32 v8, v13, v10 +; CGP-NEXT: v_mul_lo_u32 v8, v13, v11 +; CGP-NEXT: v_xor_b32_e32 v15, v7, v14 +; CGP-NEXT: v_mul_hi_u32 v7, v13, v10 ; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v8, v16, v11 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v9, v13, v11 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v16, v11 +; CGP-NEXT: v_mul_lo_u32 v7, v16, v11 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_mul_hi_u32 v8, v13, v11 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; CGP-NEXT: v_mul_hi_u32 v10, v16, v11 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v16, v8, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v15, v4 -; CGP-NEXT: v_mul_lo_u32 v10, v12, v8 +; CGP-NEXT: v_addc_u32_e32 v7, vcc, v16, v7, vcc +; CGP-NEXT: v_mul_lo_u32 v8, v15, v4 +; 
CGP-NEXT: v_mul_lo_u32 v10, v12, v7 ; CGP-NEXT: v_mul_hi_u32 v11, v12, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v13, v15, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v15, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v12, v8 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v11, v15, v7 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; CGP-NEXT: v_mul_hi_u32 v10, v12, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v11, 0 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v4, v8 +; CGP-NEXT: v_mul_hi_u32 v11, v15, v7 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v13, 0 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v4 -; CGP-NEXT: v_mov_b32_e32 v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v10, v[4:5] -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v12, v8 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v1, v11, v[9:10] -; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v15, v9, vcc -; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v15, v9 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v4 +; CGP-NEXT: v_mov_b32_e32 v4, v8 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v0, v10, v[4:5] +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v12, v7 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v1, v13, v[10:11] +; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v15, v10, vcc +; CGP-NEXT: v_sub_i32_e64 v8, s[4:5], v15, v10 +; CGP-NEXT: 
v_cmp_ge_u32_e64 s[4:5], v7, v1 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v0 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v1 -; CGP-NEXT: v_subb_u32_e32 v9, vcc, v9, v1, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v1 +; CGP-NEXT: v_subb_u32_e32 v8, vcc, v8, v1, vcc ; CGP-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] ; CGP-NEXT: v_sub_i32_e32 v11, vcc, v4, v0 -; CGP-NEXT: v_subbrev_u32_e64 v12, s[4:5], 0, v9, vcc +; CGP-NEXT: v_subbrev_u32_e64 v12, s[4:5], 0, v8, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v1 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v0 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v1 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v9, v1, vcc +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v8, v1, vcc ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v0 ; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v15, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc @@ -2763,156 +2768,153 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cndmask_b32_e32 v1, v12, v1, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v0, v14 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v14 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v14, vcc -; CGP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; CGP-NEXT: ; implicit-def: $vgpr8 +; CGP-NEXT: ; implicit-def: $vgpr12_vgpr13 +; CGP-NEXT: ; implicit-def: $vgpr7 +; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: .LBB8_2: ; %Flow1 -; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] -; CGP-NEXT: v_lshl_b64 v[9:10], v[2:3], v6 -; CGP-NEXT: s_xor_b64 exec, exec, s[4:5] -; CGP-NEXT: s_cbranch_execz .LBB8_4 +; CGP-NEXT: v_lshl_b64 v[10:11], v[2:3], v6 +; CGP-NEXT: 
s_xor_b64 s[4:5], s[6:7], exec +; CGP-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; CGP-NEXT: s_cmov_b64 exec, s[6:7] +; CGP-NEXT: s_cbranch_scc0 .LBB8_4 ; CGP-NEXT: ; %bb.3: -; CGP-NEXT: v_cvt_f32_u32_e32 v0, v11 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v11 +; CGP-NEXT: v_cvt_f32_u32_e32 v0, v12 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v12 ; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0 ; CGP-NEXT: v_mul_lo_u32 v1, v1, v0 ; CGP-NEXT: v_mul_hi_u32 v1, v0, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 -; CGP-NEXT: v_mul_lo_u32 v0, v0, v11 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v8, v0 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v11 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v11 +; CGP-NEXT: v_mul_hi_u32 v0, v7, v0 +; CGP-NEXT: v_mul_lo_u32 v0, v0, v12 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v7, v0 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v12 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v12 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v11 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v11 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v12 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v12 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; CGP-NEXT: v_mov_b32_e32 v1, 0 -; CGP-NEXT: .LBB8_4: ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] -; CGP-NEXT: v_or_b32_e32 v3, v7, v10 +; CGP-NEXT: .LBB8_4: +; CGP-NEXT: v_or_b32_e32 v3, v9, v11 ; CGP-NEXT: v_mov_b32_e32 v2, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CGP-NEXT: s_cbranch_execnz .LBB8_7 -; CGP-NEXT: ; %bb.5: ; %Flow -; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; CGP-NEXT: s_cbranch_execnz .LBB8_8 -; CGP-NEXT: .LBB8_6: -; CGP-NEXT: s_or_b64 exec, exec, s[4:5] -; CGP-NEXT: s_setpc_b64 s[30:31] -; CGP-NEXT: .LBB8_7: -; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v10 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v9, v3 
-; CGP-NEXT: v_addc_u32_e32 v4, vcc, v10, v3, vcc +; CGP-NEXT: s_xor_b64 s[6:7], vcc, exec +; CGP-NEXT: s_and_b64 s[4:5], vcc, -1 +; CGP-NEXT: s_cmov_b64 exec, vcc +; CGP-NEXT: s_cbranch_scc0 .LBB8_6 +; CGP-NEXT: ; %bb.5: +; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v11 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v10, v3 +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v11, v3, vcc ; CGP-NEXT: v_xor_b32_e32 v2, v2, v3 ; CGP-NEXT: v_xor_b32_e32 v3, v4, v3 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 -; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v2 -; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v2 +; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v6, v6 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v11, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 -; CGP-NEXT: v_mov_b32_e32 v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v6, v8 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v10, v11, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 -; CGP-NEXT: v_mul_lo_u32 v14, v11, v9 -; CGP-NEXT: v_mul_lo_u32 v15, v6, v9 +; CGP-NEXT: v_trunc_f32_e32 v8, v6 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v10, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v13, v8 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v10, 0 +; CGP-NEXT: v_mov_b32_e32 v4, v7 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5] +; CGP-NEXT: v_mul_lo_u32 v4, v13, v6 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v10, v[7:8] +; CGP-NEXT: v_mul_hi_u32 v8, v10, v6 +; CGP-NEXT: v_mul_hi_u32 v6, v13, v6 +; CGP-NEXT: v_mul_lo_u32 v14, v10, v7 +; CGP-NEXT: v_mul_lo_u32 v15, v13, v7 ; CGP-NEXT: 
v_add_i32_e32 v4, vcc, v4, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v10, v7 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v15, v8 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v15, v6 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v14, v10 -; CGP-NEXT: v_mul_hi_u32 v9, v6, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v4 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 -; CGP-NEXT: v_mov_b32_e32 v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v7 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v12 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v7, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v7, v4, v12 -; CGP-NEXT: v_mul_lo_u32 v4, v6, v8 -; CGP-NEXT: v_mul_lo_u32 v10, v11, v9 -; CGP-NEXT: v_xor_b32_e32 v13, v5, v12 -; CGP-NEXT: v_mul_hi_u32 v5, v11, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v14, v8 +; CGP-NEXT: v_mul_hi_u32 v7, v13, v7 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v4 +; CGP-NEXT: v_addc_u32_e32 v13, vcc, v13, v6, vcc +; CGP-NEXT: v_mad_u64_u32 v[6:7], 
s[4:5], v11, v10, 0 +; CGP-NEXT: v_mov_b32_e32 v4, v7 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5] +; CGP-NEXT: v_ashrrev_i32_e32 v11, 31, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v11 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v10, v[7:8] +; CGP-NEXT: v_addc_u32_e32 v5, vcc, v9, v11, vcc +; CGP-NEXT: v_xor_b32_e32 v8, v4, v11 +; CGP-NEXT: v_mul_lo_u32 v4, v13, v6 +; CGP-NEXT: v_mul_lo_u32 v9, v10, v7 +; CGP-NEXT: v_xor_b32_e32 v12, v5, v11 +; CGP-NEXT: v_mul_hi_u32 v5, v10, v6 +; CGP-NEXT: v_mul_hi_u32 v6, v13, v6 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v6, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v9 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CGP-NEXT: v_mul_hi_u32 v9, v6, v9 +; CGP-NEXT: v_mul_lo_u32 v5, v13, v7 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CGP-NEXT: v_mul_hi_u32 v9, v10, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; CGP-NEXT: v_mul_hi_u32 v7, v13, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v13, v4 -; CGP-NEXT: v_mul_lo_u32 v8, v7, v5 -; CGP-NEXT: v_mul_hi_u32 v9, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v13, v4 -; CGP-NEXT: v_mul_hi_u32 v10, v13, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; CGP-NEXT: v_cndmask_b32_e64 
v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_addc_u32_e32 v5, vcc, v13, v5, vcc +; CGP-NEXT: v_mul_lo_u32 v6, v12, v4 +; CGP-NEXT: v_mul_lo_u32 v7, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v9, v8, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v12, v4 +; CGP-NEXT: v_mul_hi_u32 v10, v12, v5 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v13, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_mul_hi_u32 v8, v7, v5 +; CGP-NEXT: v_mul_lo_u32 v9, v12, v5 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CGP-NEXT: v_mul_hi_u32 v7, v8, v5 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v4, v6 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v9, 0 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v10, v6 ; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[5:6] -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v7, v4 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v8, v4 ; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, v[5:6] -; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v13, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v12, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v12, v5 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2 @@ -2925,11 +2927,11 @@ define <2 x i64> 
@v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3 ; CGP-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 -; CGP-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v10, v10, v12, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; CGP-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc @@ -2937,17 +2939,21 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v12 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v12 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc -; CGP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; CGP-NEXT: v_xor_b32_e32 v2, v2, v11 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v11 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v11 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v11, vcc +; CGP-NEXT: ; implicit-def: $vgpr10_vgpr11 ; CGP-NEXT: ; implicit-def: $vgpr5 -; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; CGP-NEXT: s_cbranch_execz .LBB8_6 -; CGP-NEXT: .LBB8_8: -; CGP-NEXT: v_cvt_f32_u32_e32 v2, v9 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v9 +; CGP-NEXT: s_or_b64 exec, exec, s[6:7] +; CGP-NEXT: .LBB8_6: ; %Flow +; CGP-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; CGP-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; CGP-NEXT: s_cmov_b64 exec, s[6:7] +; CGP-NEXT: s_cbranch_scc0 .LBB8_8 +; CGP-NEXT: ; %bb.7: +; CGP-NEXT: v_cvt_f32_u32_e32 v2, v10 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v10 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CGP-NEXT: 
v_cvt_u32_f32_e32 v2, v2 @@ -2955,16 +2961,17 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_mul_hi_u32 v2, v5, v2 -; CGP-NEXT: v_mul_lo_u32 v2, v2, v9 +; CGP-NEXT: v_mul_lo_u32 v2, v2, v10 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v9 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v9 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v10 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v9 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v9 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v10 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; CGP-NEXT: v_mov_b32_e32 v3, 0 ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] +; CGP-NEXT: .LBB8_8: ; CGP-NEXT: s_setpc_b64 s[30:31] %shl.y = shl <2 x i64> , %y %r = srem <2 x i64> %x, %shl.y diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll index e31d8e95bd6084..d3ef40200efbcf 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -13,18 +13,13 @@ define i64 @v_udiv_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_or_b32_e32 v1, v5, v3 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CHECK-NEXT: s_xor_b64 s[6:7], vcc, exec +; CHECK-NEXT: s_and_b64 s[4:5], vcc, -1 ; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v2 ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB0_3 -; CHECK-NEXT: ; %bb.1: ; %Flow -; CHECK-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; CHECK-NEXT: s_cbranch_execnz .LBB0_4 -; CHECK-NEXT: .LBB0_2: -; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] -; CHECK-NEXT: s_setpc_b64 s[30:31] -; CHECK-NEXT: .LBB0_3: +; CHECK-NEXT: s_cmov_b64 
exec, vcc +; CHECK-NEXT: s_cbranch_scc0 .LBB0_2 +; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v3 ; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 ; CHECK-NEXT: v_subb_u32_e32 v7, vcc, 0, v3, vcc @@ -152,9 +147,13 @@ define i64 @v_udiv_i64(i64 %num, i64 %den) { ; CHECK-NEXT: ; implicit-def: $vgpr6 ; CHECK-NEXT: ; implicit-def: $vgpr2 ; CHECK-NEXT: ; implicit-def: $vgpr4 -; CHECK-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; CHECK-NEXT: s_cbranch_execz .LBB0_2 -; CHECK-NEXT: .LBB0_4: +; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: .LBB0_2: ; %Flow +; CHECK-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; CHECK-NEXT: s_and_b64 s[4:5], s[6:7], -1 +; CHECK-NEXT: s_cmov_b64 exec, s[6:7] +; CHECK-NEXT: s_cbranch_scc0 .LBB0_4 +; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v6 ; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 ; CHECK-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -174,7 +173,8 @@ define i64 @v_udiv_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: s_or_b64 exec, exec, s[8:9] +; CHECK-NEXT: .LBB0_4: ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = udiv i64 %num, %den ret i64 %result @@ -627,11 +627,12 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_or_b32_e32 v1, v11, v5 ; CGP-NEXT: v_mov_b32_e32 v0, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CGP-NEXT: s_xor_b64 s[6:7], vcc, exec +; CGP-NEXT: s_and_b64 s[4:5], vcc, -1 ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v4 ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CGP-NEXT: s_cbranch_execz .LBB2_2 +; CGP-NEXT: s_cmov_b64 exec, vcc +; CGP-NEXT: s_cbranch_scc0 .LBB2_2 ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v5 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v4 @@ -760,9 +761,12 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x 
i64> %den) { ; CGP-NEXT: ; implicit-def: $vgpr2 ; CGP-NEXT: ; implicit-def: $vgpr4 ; CGP-NEXT: ; implicit-def: $vgpr10 +; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: .LBB2_2: ; %Flow1 -; CGP-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; CGP-NEXT: s_cbranch_execz .LBB2_4 +; CGP-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; CGP-NEXT: s_and_b64 s[4:5], s[6:7], -1 +; CGP-NEXT: s_cmov_b64 exec, s[6:7] +; CGP-NEXT: s_cbranch_scc0 .LBB2_4 ; CGP-NEXT: ; %bb.3: ; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v2 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v4 @@ -783,23 +787,18 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CGP-NEXT: v_mov_b32_e32 v1, 0 +; CGP-NEXT: s_or_b64 exec, exec, s[8:9] ; CGP-NEXT: .LBB2_4: -; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: v_or_b32_e32 v3, v9, v7 ; CGP-NEXT: v_mov_b32_e32 v2, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; CGP-NEXT: s_xor_b64 s[6:7], vcc, exec +; CGP-NEXT: s_and_b64 s[4:5], vcc, -1 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v6 ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CGP-NEXT: s_cbranch_execnz .LBB2_7 -; CGP-NEXT: ; %bb.5: ; %Flow -; CGP-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; CGP-NEXT: s_cbranch_execnz .LBB2_8 -; CGP-NEXT: .LBB2_6: -; CGP-NEXT: s_or_b64 exec, exec, s[6:7] -; CGP-NEXT: s_setpc_b64 s[30:31] -; CGP-NEXT: .LBB2_7: +; CGP-NEXT: s_cmov_b64 exec, vcc +; CGP-NEXT: s_cbranch_scc0 .LBB2_6 +; CGP-NEXT: ; %bb.5: ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v7 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v6 ; CGP-NEXT: v_subb_u32_e32 v5, vcc, 0, v7, vcc @@ -927,9 +926,13 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: ; implicit-def: $vgpr4 ; CGP-NEXT: ; implicit-def: $vgpr6 ; CGP-NEXT: ; implicit-def: $vgpr8 -; CGP-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; CGP-NEXT: s_cbranch_execz .LBB2_6 -; CGP-NEXT: .LBB2_8: +; 
CGP-NEXT: s_or_b64 exec, exec, s[6:7] +; CGP-NEXT: .LBB2_6: ; %Flow +; CGP-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; CGP-NEXT: s_and_b64 s[4:5], s[6:7], -1 +; CGP-NEXT: s_cmov_b64 exec, s[6:7] +; CGP-NEXT: s_cbranch_scc0 .LBB2_8 +; CGP-NEXT: ; %bb.7: ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v4 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v6 ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 @@ -949,7 +952,8 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: v_mov_b32_e32 v3, 0 -; CGP-NEXT: s_or_b64 exec, exec, s[6:7] +; CGP-NEXT: s_or_b64 exec, exec, s[8:9] +; CGP-NEXT: .LBB2_8: ; CGP-NEXT: s_setpc_b64 s[30:31] %result = udiv <2 x i64> %num, %den ret <2 x i64> %result @@ -1072,22 +1076,17 @@ define i64 @v_udiv_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_mov_b32_e32 v4, v1 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x1000 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: v_mov_b32_e32 v7, 0 ; CHECK-NEXT: v_lshl_b64 v[5:6], v[0:1], v2 -; CHECK-NEXT: v_or_b32_e32 v8, v4, v6 -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[7:8] +; CHECK-NEXT: v_or_b32_e32 v1, v4, v6 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CHECK-NEXT: s_xor_b64 s[6:7], vcc, exec +; CHECK-NEXT: s_and_b64 s[4:5], vcc, -1 ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v5 ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB7_3 -; CHECK-NEXT: ; %bb.1: ; %Flow -; CHECK-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; CHECK-NEXT: s_cbranch_execnz .LBB7_4 -; CHECK-NEXT: .LBB7_2: -; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] -; CHECK-NEXT: s_setpc_b64 s[30:31] -; CHECK-NEXT: .LBB7_3: +; CHECK-NEXT: s_cmov_b64 exec, vcc +; CHECK-NEXT: s_cbranch_scc0 .LBB7_2 +; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v6 ; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v5 ; CHECK-NEXT: 
v_subb_u32_e32 v7, vcc, 0, v6, vcc @@ -1215,9 +1214,13 @@ define i64 @v_udiv_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: ; implicit-def: $vgpr2 ; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6 ; CHECK-NEXT: ; implicit-def: $vgpr3 -; CHECK-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; CHECK-NEXT: s_cbranch_execz .LBB7_2 -; CHECK-NEXT: .LBB7_4: +; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: .LBB7_2: ; %Flow +; CHECK-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; CHECK-NEXT: s_and_b64 s[4:5], s[6:7], -1 +; CHECK-NEXT: s_cmov_b64 exec, s[6:7] +; CHECK-NEXT: s_cbranch_scc0 .LBB7_4 +; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v2 ; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v5 ; CHECK-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -1237,7 +1240,8 @@ define i64 @v_udiv_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: s_or_b64 exec, exec, s[8:9] +; CHECK-NEXT: .LBB7_4: ; CHECK-NEXT: s_setpc_b64 s[30:31] %shl.y = shl i64 4096, %y %r = udiv i64 %x, %shl.y @@ -1513,15 +1517,16 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_mov_b32_e32 v7, v3 ; CGP-NEXT: v_mov_b32_e32 v10, 0x1000 ; CGP-NEXT: v_mov_b32_e32 v11, 0 -; CGP-NEXT: v_mov_b32_e32 v0, 0 ; CGP-NEXT: v_lshl_b64 v[2:3], v[10:11], v4 ; CGP-NEXT: v_or_b32_e32 v1, v9, v3 +; CGP-NEXT: v_mov_b32_e32 v0, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CGP-NEXT: s_xor_b64 s[6:7], vcc, exec +; CGP-NEXT: s_and_b64 s[4:5], vcc, -1 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CGP-NEXT: s_cbranch_execz .LBB8_2 +; CGP-NEXT: s_cmov_b64 exec, vcc +; CGP-NEXT: s_cbranch_scc0 .LBB8_2 ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v3 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 @@ -1650,11 
+1655,13 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: ; implicit-def: $vgpr4 ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CGP-NEXT: ; implicit-def: $vgpr8 +; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: .LBB8_2: ; %Flow1 -; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] +; CGP-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; CGP-NEXT: s_and_b64 s[4:5], s[6:7], -1 ; CGP-NEXT: v_lshl_b64 v[9:10], v[10:11], v6 -; CGP-NEXT: s_xor_b64 exec, exec, s[6:7] -; CGP-NEXT: s_cbranch_execz .LBB8_4 +; CGP-NEXT: s_cmov_b64 exec, s[6:7] +; CGP-NEXT: s_cbranch_scc0 .LBB8_4 ; CGP-NEXT: ; %bb.3: ; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v4 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 @@ -1675,23 +1682,18 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CGP-NEXT: v_mov_b32_e32 v1, 0 +; CGP-NEXT: s_or_b64 exec, exec, s[8:9] ; CGP-NEXT: .LBB8_4: -; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: v_or_b32_e32 v3, v7, v10 ; CGP-NEXT: v_mov_b32_e32 v2, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; CGP-NEXT: s_xor_b64 s[6:7], vcc, exec +; CGP-NEXT: s_and_b64 s[4:5], vcc, -1 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v9 ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CGP-NEXT: s_cbranch_execnz .LBB8_7 -; CGP-NEXT: ; %bb.5: ; %Flow -; CGP-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; CGP-NEXT: s_cbranch_execnz .LBB8_8 -; CGP-NEXT: .LBB8_6: -; CGP-NEXT: s_or_b64 exec, exec, s[6:7] -; CGP-NEXT: s_setpc_b64 s[30:31] -; CGP-NEXT: .LBB8_7: +; CGP-NEXT: s_cmov_b64 exec, vcc +; CGP-NEXT: s_cbranch_scc0 .LBB8_6 +; CGP-NEXT: ; %bb.5: ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v10 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v9 ; CGP-NEXT: v_subb_u32_e32 v6, vcc, 0, v10, vcc @@ -1819,9 +1821,13 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: ; 
implicit-def: $vgpr4 ; CGP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; CGP-NEXT: ; implicit-def: $vgpr5 -; CGP-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; CGP-NEXT: s_cbranch_execz .LBB8_6 -; CGP-NEXT: .LBB8_8: +; CGP-NEXT: s_or_b64 exec, exec, s[6:7] +; CGP-NEXT: .LBB8_6: ; %Flow +; CGP-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; CGP-NEXT: s_and_b64 s[4:5], s[6:7], -1 +; CGP-NEXT: s_cmov_b64 exec, s[6:7] +; CGP-NEXT: s_cbranch_scc0 .LBB8_8 +; CGP-NEXT: ; %bb.7: ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v4 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v9 ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 @@ -1841,7 +1847,8 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v9 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: v_mov_b32_e32 v3, 0 -; CGP-NEXT: s_or_b64 exec, exec, s[6:7] +; CGP-NEXT: s_or_b64 exec, exec, s[8:9] +; CGP-NEXT: .LBB8_8: ; CGP-NEXT: s_setpc_b64 s[30:31] %shl.y = shl <2 x i64> , %y %r = udiv <2 x i64> %x, %shl.y diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll index f30b278b3e611c..805686057b8a8e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -13,18 +13,13 @@ define i64 @v_urem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_or_b32_e32 v1, v5, v3 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CHECK-NEXT: s_xor_b64 s[6:7], vcc, exec +; CHECK-NEXT: s_and_b64 s[4:5], vcc, -1 ; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v2 ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB0_3 -; CHECK-NEXT: ; %bb.1: ; %Flow -; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; CHECK-NEXT: s_cbranch_execnz .LBB0_4 -; CHECK-NEXT: .LBB0_2: -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_setpc_b64 s[30:31] -; CHECK-NEXT: .LBB0_3: +; 
CHECK-NEXT: s_cmov_b64 exec, vcc +; CHECK-NEXT: s_cbranch_scc0 .LBB0_2 +; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v3 ; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 ; CHECK-NEXT: v_subb_u32_e32 v7, vcc, 0, v3, vcc @@ -151,9 +146,13 @@ define i64 @v_urem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: ; implicit-def: $vgpr6 ; CHECK-NEXT: ; implicit-def: $vgpr2 ; CHECK-NEXT: ; implicit-def: $vgpr4 -; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; CHECK-NEXT: s_cbranch_execz .LBB0_2 -; CHECK-NEXT: .LBB0_4: +; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: .LBB0_2: ; %Flow +; CHECK-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; CHECK-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; CHECK-NEXT: s_cmov_b64 exec, s[6:7] +; CHECK-NEXT: s_cbranch_scc0 .LBB0_4 +; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v6 ; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 ; CHECK-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -172,6 +171,7 @@ define i64 @v_urem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: .LBB0_4: ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = urem i64 %num, %den ret i64 %result @@ -619,11 +619,12 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_or_b32_e32 v1, v11, v5 ; CGP-NEXT: v_mov_b32_e32 v0, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CGP-NEXT: s_xor_b64 s[6:7], vcc, exec +; CGP-NEXT: s_and_b64 s[4:5], vcc, -1 ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v4 ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CGP-NEXT: s_cbranch_execz .LBB2_2 +; CGP-NEXT: s_cmov_b64 exec, vcc +; CGP-NEXT: s_cbranch_scc0 .LBB2_2 ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v5 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v4 @@ -751,9 +752,12 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: ; implicit-def: $vgpr2 ; CGP-NEXT: ; 
implicit-def: $vgpr4 ; CGP-NEXT: ; implicit-def: $vgpr10 +; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: .LBB2_2: ; %Flow1 -; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; CGP-NEXT: s_cbranch_execz .LBB2_4 +; CGP-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; CGP-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; CGP-NEXT: s_cmov_b64 exec, s[6:7] +; CGP-NEXT: s_cbranch_scc0 .LBB2_4 ; CGP-NEXT: ; %bb.3: ; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v2 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v4 @@ -772,23 +776,18 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; CGP-NEXT: v_mov_b32_e32 v1, 0 -; CGP-NEXT: .LBB2_4: ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] +; CGP-NEXT: .LBB2_4: ; CGP-NEXT: v_or_b32_e32 v3, v9, v7 ; CGP-NEXT: v_mov_b32_e32 v2, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; CGP-NEXT: s_xor_b64 s[6:7], vcc, exec +; CGP-NEXT: s_and_b64 s[4:5], vcc, -1 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v6 ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CGP-NEXT: s_cbranch_execnz .LBB2_7 -; CGP-NEXT: ; %bb.5: ; %Flow -; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; CGP-NEXT: s_cbranch_execnz .LBB2_8 -; CGP-NEXT: .LBB2_6: -; CGP-NEXT: s_or_b64 exec, exec, s[4:5] -; CGP-NEXT: s_setpc_b64 s[30:31] -; CGP-NEXT: .LBB2_7: +; CGP-NEXT: s_cmov_b64 exec, vcc +; CGP-NEXT: s_cbranch_scc0 .LBB2_6 +; CGP-NEXT: ; %bb.5: ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v7 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v6 ; CGP-NEXT: v_subb_u32_e32 v5, vcc, 0, v7, vcc @@ -915,9 +914,13 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: ; implicit-def: $vgpr4 ; CGP-NEXT: ; implicit-def: $vgpr6 ; CGP-NEXT: ; implicit-def: $vgpr8 -; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; CGP-NEXT: s_cbranch_execz .LBB2_6 -; CGP-NEXT: .LBB2_8: +; CGP-NEXT: s_or_b64 exec, exec, s[6:7] +; CGP-NEXT: .LBB2_6: ; %Flow +; CGP-NEXT: 
s_xor_b64 s[4:5], s[6:7], exec +; CGP-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; CGP-NEXT: s_cmov_b64 exec, s[6:7] +; CGP-NEXT: s_cbranch_scc0 .LBB2_8 +; CGP-NEXT: ; %bb.7: ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v4 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v6 ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 @@ -936,6 +939,7 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; CGP-NEXT: v_mov_b32_e32 v3, 0 ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] +; CGP-NEXT: .LBB2_8: ; CGP-NEXT: s_setpc_b64 s[30:31] %result = urem <2 x i64> %num, %den ret <2 x i64> %result @@ -1501,22 +1505,17 @@ define i64 @v_urem_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_mov_b32_e32 v4, v1 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x1000 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: v_mov_b32_e32 v7, 0 ; CHECK-NEXT: v_lshl_b64 v[5:6], v[0:1], v2 -; CHECK-NEXT: v_or_b32_e32 v8, v4, v6 -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[7:8] +; CHECK-NEXT: v_or_b32_e32 v1, v4, v6 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CHECK-NEXT: s_xor_b64 s[6:7], vcc, exec +; CHECK-NEXT: s_and_b64 s[4:5], vcc, -1 ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v5 ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB7_3 -; CHECK-NEXT: ; %bb.1: ; %Flow -; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; CHECK-NEXT: s_cbranch_execnz .LBB7_4 -; CHECK-NEXT: .LBB7_2: -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_setpc_b64 s[30:31] -; CHECK-NEXT: .LBB7_3: +; CHECK-NEXT: s_cmov_b64 exec, vcc +; CHECK-NEXT: s_cbranch_scc0 .LBB7_2 +; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v6 ; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v5 ; CHECK-NEXT: v_subb_u32_e32 v7, vcc, 0, v6, vcc @@ -1643,9 +1642,13 @@ define i64 @v_urem_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: ; implicit-def: $vgpr2 ; CHECK-NEXT: ; 
implicit-def: $vgpr5_vgpr6 ; CHECK-NEXT: ; implicit-def: $vgpr3 -; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; CHECK-NEXT: s_cbranch_execz .LBB7_2 -; CHECK-NEXT: .LBB7_4: +; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: .LBB7_2: ; %Flow +; CHECK-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; CHECK-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; CHECK-NEXT: s_cmov_b64 exec, s[6:7] +; CHECK-NEXT: s_cbranch_scc0 .LBB7_4 +; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v2 ; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v5 ; CHECK-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -1664,6 +1667,7 @@ define i64 @v_urem_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: .LBB7_4: ; CHECK-NEXT: s_setpc_b64 s[30:31] %shl.y = shl i64 4096, %y %r = urem i64 %x, %shl.y @@ -1937,15 +1941,16 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_mov_b32_e32 v7, v3 ; CGP-NEXT: v_mov_b32_e32 v10, 0x1000 ; CGP-NEXT: v_mov_b32_e32 v11, 0 -; CGP-NEXT: v_mov_b32_e32 v0, 0 ; CGP-NEXT: v_lshl_b64 v[2:3], v[10:11], v4 ; CGP-NEXT: v_or_b32_e32 v1, v9, v3 +; CGP-NEXT: v_mov_b32_e32 v0, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CGP-NEXT: s_xor_b64 s[6:7], vcc, exec +; CGP-NEXT: s_and_b64 s[4:5], vcc, -1 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CGP-NEXT: s_cbranch_execz .LBB8_2 +; CGP-NEXT: s_cmov_b64 exec, vcc +; CGP-NEXT: s_cbranch_scc0 .LBB8_2 ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v3 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 @@ -2073,11 +2078,13 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: ; implicit-def: $vgpr4 ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CGP-NEXT: ; implicit-def: $vgpr8 +; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: 
.LBB8_2: ; %Flow1 -; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] +; CGP-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; CGP-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; CGP-NEXT: v_lshl_b64 v[9:10], v[10:11], v6 -; CGP-NEXT: s_xor_b64 exec, exec, s[4:5] -; CGP-NEXT: s_cbranch_execz .LBB8_4 +; CGP-NEXT: s_cmov_b64 exec, s[6:7] +; CGP-NEXT: s_cbranch_scc0 .LBB8_4 ; CGP-NEXT: ; %bb.3: ; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v4 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 @@ -2096,23 +2103,18 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; CGP-NEXT: v_mov_b32_e32 v1, 0 -; CGP-NEXT: .LBB8_4: ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] +; CGP-NEXT: .LBB8_4: ; CGP-NEXT: v_or_b32_e32 v3, v7, v10 ; CGP-NEXT: v_mov_b32_e32 v2, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; CGP-NEXT: s_xor_b64 s[6:7], vcc, exec +; CGP-NEXT: s_and_b64 s[4:5], vcc, -1 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v9 ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CGP-NEXT: s_cbranch_execnz .LBB8_7 -; CGP-NEXT: ; %bb.5: ; %Flow -; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; CGP-NEXT: s_cbranch_execnz .LBB8_8 -; CGP-NEXT: .LBB8_6: -; CGP-NEXT: s_or_b64 exec, exec, s[4:5] -; CGP-NEXT: s_setpc_b64 s[30:31] -; CGP-NEXT: .LBB8_7: +; CGP-NEXT: s_cmov_b64 exec, vcc +; CGP-NEXT: s_cbranch_scc0 .LBB8_6 +; CGP-NEXT: ; %bb.5: ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v10 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v9 ; CGP-NEXT: v_subb_u32_e32 v6, vcc, 0, v10, vcc @@ -2239,9 +2241,13 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: ; implicit-def: $vgpr4 ; CGP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; CGP-NEXT: ; implicit-def: $vgpr5 -; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; CGP-NEXT: s_cbranch_execz .LBB8_6 -; CGP-NEXT: .LBB8_8: +; CGP-NEXT: s_or_b64 exec, exec, s[6:7] +; CGP-NEXT: .LBB8_6: ; %Flow +; 
CGP-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; CGP-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; CGP-NEXT: s_cmov_b64 exec, s[6:7] +; CGP-NEXT: s_cbranch_scc0 .LBB8_8 +; CGP-NEXT: ; %bb.7: ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v4 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v9 ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 @@ -2260,6 +2266,7 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; CGP-NEXT: v_mov_b32_e32 v3, 0 ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] +; CGP-NEXT: .LBB8_8: ; CGP-NEXT: s_setpc_b64 s[30:31] %shl.y = shl <2 x i64> , %y %r = urem <2 x i64> %x, %shl.y diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll index 1f1c2659e81103..2b46a69548fe18 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll @@ -8,35 +8,37 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX906-NEXT: v_mov_b32_e32 v3, 8 -; GFX906-NEXT: v_mov_b32_e32 v5, 16 +; GFX906-NEXT: v_mov_b32_e32 v4, 16 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dword v4, v2, s[4:5] -; GFX906-NEXT: v_mov_b32_e32 v1, 0xff +; GFX906-NEXT: global_load_dword v5, v2, s[4:5] ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_mov_b64 s[8:9], exec +; GFX906-NEXT: v_mov_b32_e32 v1, 0xff +; GFX906-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v6, 0xff, v4 -; GFX906-NEXT: v_lshlrev_b32_sdwa v7, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX906-NEXT: v_lshlrev_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX906-NEXT: v_or3_b32 v4, v6, v7, v4 -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX906-NEXT: s_cbranch_execz .LBB0_2 
+; GFX906-NEXT: v_and_b32_e32 v6, 0xff, v5 +; GFX906-NEXT: v_lshlrev_b32_sdwa v7, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX906-NEXT: v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX906-NEXT: v_or3_b32 v0, v6, v7, v5 +; GFX906-NEXT: s_cmov_b64 exec, vcc +; GFX906-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dword v0, v2, s[6:7] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX906-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX906-NEXT: v_or3_b32 v4, v2, v3, v0 +; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX906-NEXT: v_or3_b32 v0, v2, v3, v0 +; GFX906-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX906-NEXT: .LBB0_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v4 -; GFX906-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX906-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX906-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, 
v1 ; GFX906-NEXT: v_mov_b32_e32 v1, 0 ; GFX906-NEXT: global_store_short v1, v0, s[0:1] ; GFX906-NEXT: global_store_byte_d16_hi v1, v0, s[0:1] offset:2 @@ -65,14 +67,16 @@ define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_mov_b64 s[8:9], exec ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dword v1, v2, s[4:5] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX906-NEXT: s_cbranch_execz .LBB1_2 +; GFX906-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX906-NEXT: s_cmov_b64 exec, vcc +; GFX906-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dword v1, v2, s[6:7] +; GFX906-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX906-NEXT: .LBB1_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: global_store_dword v0, v1, s[0:1] @@ -101,18 +105,20 @@ define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_mov_b64 s[8:9], exec ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[4:5] +; GFX906-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX906-NEXT: s_cbranch_execz .LBB2_2 +; GFX906-NEXT: s_cmov_b64 exec, vcc +; GFX906-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[6:7] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX906-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX906-NEXT: .LBB2_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: v_mov_b32_e32 v4, 0 ; 
GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v1 @@ -146,14 +152,16 @@ define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_mov_b64 s[8:9], exec ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[4:5] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX906-NEXT: s_cbranch_execz .LBB3_2 +; GFX906-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX906-NEXT: s_cmov_b64 exec, vcc +; GFX906-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[6:7] +; GFX906-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX906-NEXT: .LBB3_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1] @@ -182,14 +190,16 @@ define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1 ; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v5, 4, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_mov_b64 s[8:9], exec ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dwordx4 v[1:4], v5, s[4:5] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX906-NEXT: s_cbranch_execz .LBB4_2 +; GFX906-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX906-NEXT: s_cmov_b64 exec, vcc +; GFX906-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx4 v[1:4], v5, s[6:7] +; GFX906-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX906-NEXT: .LBB4_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] @@ -218,16 +228,18 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1 ; 
GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v9, 5, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_mov_b64 s[8:9], exec ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dwordx4 v[1:4], v9, s[4:5] ; GFX906-NEXT: global_load_dwordx4 v[5:8], v9, s[4:5] offset:16 -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX906-NEXT: s_cbranch_execz .LBB5_2 +; GFX906-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX906-NEXT: s_cmov_b64 exec, vcc +; GFX906-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx4 v[1:4], v9, s[6:7] ; GFX906-NEXT: global_load_dwordx4 v[5:8], v9, s[6:7] offset:16 +; GFX906-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX906-NEXT: .LBB5_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] @@ -266,6 +278,8 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: s_add_u32 s12, s12, s9 ; GFX906-NEXT: s_addc_u32 s13, s13, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_mov_b64 s[8:9], exec +; GFX906-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: buffer_store_dword v5, off, s[12:15], 0 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(0) @@ -288,8 +302,8 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: global_load_dwordx4 v[53:56], v4, s[4:5] offset:208 ; GFX906-NEXT: global_load_dwordx4 v[57:60], v4, s[4:5] offset:224 ; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[4:5] offset:240 -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX906-NEXT: s_cbranch_execz .LBB6_2 +; GFX906-NEXT: s_cmov_b64 exec, vcc +; GFX906-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] ; GFX906-NEXT: s_waitcnt vmcnt(0) @@ -313,8 +327,8 @@ define amdgpu_kernel void 
@v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: global_load_dwordx4 v[53:56], v4, s[6:7] offset:208 ; GFX906-NEXT: global_load_dwordx4 v[57:60], v4, s[6:7] offset:224 ; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] offset:240 +; GFX906-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX906-NEXT: .LBB6_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(0) @@ -482,27 +496,31 @@ define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 -; GFX906-NEXT: s_xor_b64 s[0:1], vcc, -1 +; GFX906-NEXT: s_xor_b64 s[2:3], vcc, -1 +; GFX906-NEXT: s_mov_b64 s[0:1], exec ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[4:5] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX906-NEXT: s_cbranch_execz .LBB8_2 +; GFX906-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX906-NEXT: s_cmov_b64 exec, vcc +; GFX906-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[6:7] ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 -; GFX906-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX906-NEXT: s_andn2_b64 s[2:3], s[2:3], exec ; GFX906-NEXT: s_and_b64 s[4:5], exec, vcc -; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GFX906-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: .LBB8_2: ; %Flow -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] -; GFX906-NEXT: s_cbranch_execz .LBB8_4 +; GFX906-NEXT: s_mov_b64 s[0:1], exec +; GFX906-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX906-NEXT: s_cmov_b64 exec, s[2:3] +; GFX906-NEXT: s_cbranch_scc0 .LBB8_4 ; GFX906-NEXT: ; %bb.3: ; %bb.2 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[8:9] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: .LBB8_4: ; %bb.3 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[10:11] @@ -536,25 +554,29 @@ define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspac ; GFX906-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_mov_b64 s[0:1], exec +; GFX906-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dwordx2 v[3:4], v5, s[4:5] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v1, v3 ; GFX906-NEXT: v_mov_b32_e32 v2, v4 -; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX906-NEXT: s_cbranch_execz .LBB9_4 +; GFX906-NEXT: s_cmov_b64 exec, vcc +; GFX906-NEXT: s_cbranch_scc0 .LBB9_4 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx2 v[1:2], v5, s[6:7] ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX906-NEXT: s_cbranch_execz .LBB9_3 +; GFX906-NEXT: s_mov_b64 s[2:3], exec +; GFX906-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX906-NEXT: s_cmov_b64 exec, vcc +; GFX906-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX906-NEXT: ; %bb.2: ; %bb.2 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[8:9] -; GFX906-NEXT: .LBB9_3: ; %Flow ; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX906-NEXT: .LBB9_4: ; %bb.3 +; GFX906-NEXT: .LBB9_3: ; %Flow ; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX906-NEXT: .LBB9_4: ; %bb.3 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[10:11] @@ -598,16 +620,16 @@ define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrsp ; GFX906-NEXT: v_mov_b32_e32 v2, 24 ; GFX906-NEXT: 
.LBB10_1: ; %bb.1 ; GFX906-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX906-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX906-NEXT: s_and_b64 s[4:5], exec, vcc +; GFX906-NEXT: v_and_b32_e32 v3, 0xff, v1 +; GFX906-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX906-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX906-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] +; GFX906-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX906-NEXT: v_or3_b32 v1, v0, v3, v1 -; GFX906-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX906-NEXT: s_cbranch_execnz .LBB10_1 +; GFX906-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX906-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX906-NEXT: ; %bb.2: ; %bb.2.loopexit -; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll b/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll index e03c9ca34b825a..8bf0a5a3905846 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll @@ -13,31 +13,39 @@ define amdgpu_ps void @main(i32 %arg) { ; GFX10-NEXT: s_mov_b32 s1, exec_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_mov_b32 s2, 0 -; GFX10-NEXT: s_branch .LBB0_2 -; GFX10-NEXT: .LBB0_1: ; in Loop: Header=BB0_2 Depth=1 +; GFX10-NEXT: s_branch .LBB0_3 +; GFX10-NEXT: .LBB0_1: ; %Flow +; GFX10-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-NEXT: .LBB0_2: ; in Loop: Header=BB0_3 Depth=1 ; GFX10-NEXT: s_and_b32 s0, exec_lo, vcc_lo ; GFX10-NEXT: s_or_b32 s2, s0, s2 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX10-NEXT: s_cbranch_execz .LBB0_5 -; GFX10-NEXT: .LBB0_2: ; %bb4 +; GFX10-NEXT: s_andn2_b32 s0, exec_lo, s2 +; GFX10-NEXT: s_cselect_b32 exec_lo, s0, s2 
+; GFX10-NEXT: s_cbranch_scc0 .LBB0_6 +; GFX10-NEXT: .LBB0_3: ; %bb4 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_and_saveexec_b32 s3, s1 -; GFX10-NEXT: s_cbranch_execz .LBB0_1 -; GFX10-NEXT: ; %bb.3: ; in Loop: Header=BB0_2 Depth=1 +; GFX10-NEXT: s_mov_b32 s3, exec_lo +; GFX10-NEXT: s_and_b32 s0, s1, exec_lo +; GFX10-NEXT: s_cmov_b32 exec_lo, s0 +; GFX10-NEXT: s_cbranch_scc0 .LBB0_2 +; GFX10-NEXT: ; %bb.4: ; in Loop: Header=BB0_3 Depth=1 ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX10-NEXT: s_mov_b32 s8, exec_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v1 -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: s_cbranch_execz .LBB0_1 -; GFX10-NEXT: ; %bb.4: ; in Loop: Header=BB0_2 Depth=1 +; GFX10-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10-NEXT: s_cmov_b32 exec_lo, s0 +; GFX10-NEXT: s_cbranch_scc0 .LBB0_1 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB0_3 Depth=1 ; GFX10-NEXT: s_mov_b32 s5, s4 ; GFX10-NEXT: s_mov_b32 s6, s4 ; GFX10-NEXT: s_mov_b32 s7, s4 ; GFX10-NEXT: buffer_atomic_and v0, off, s[4:7], 0 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_branch .LBB0_1 -; GFX10-NEXT: .LBB0_5: ; %bb8 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: .LBB0_6: ; %bb8 ; GFX10-NEXT: s_mov_b32 s0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll index 17fe3adc221692..6372b1f52f090e 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -20,12 +20,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-LABEL: add_i32_constant: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX6-NEXT: s_cbranch_execz .LBB0_2 +; GFX6-NEXT: s_cmov_b64 exec, vcc +; GFX6-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX6-NEXT: ; %bb.1: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -33,8 +35,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc -; GFX6-NEXT: .LBB0_2: ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: .LBB0_2: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -51,9 +53,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_cbranch_execz .LBB0_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -61,8 +65,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc -; GFX8-NEXT: .LBB0_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: .LBB0_2: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 @@ -79,9 +83,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, 
s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB0_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -89,8 +95,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc -; GFX9-NEXT: .LBB0_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: .LBB0_2: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 @@ -103,12 +109,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-LABEL: add_i32_constant: ; GFX10W64: ; %bb.0: ; %entry ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec -; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: s_mov_b64 s[0:1], exec ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB0_2 +; GFX10W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX10W64-NEXT: s_cmov_b64 exec, vcc +; GFX10W64-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -116,9 +124,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc -; GFX10W64-NEXT: .LBB0_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, 
s[0:1] +; GFX10W64-NEXT: .LBB0_2: ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) ; GFX10W64-NEXT: s_mov_b32 null, 0 @@ -132,11 +140,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W32-LABEL: add_i32_constant: ; GFX10W32: ; %bb.0: ; %entry ; GFX10W32-NEXT: s_mov_b32 s1, exec_lo -; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: s_mov_b32 s0, exec_lo ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB0_2 +; GFX10W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10W32-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX10W32-NEXT: ; %bb.1: ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s1, s1 @@ -144,9 +154,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc -; GFX10W32-NEXT: .LBB0_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10W32-NEXT: .LBB0_2: ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: s_mov_b32 null, 0 @@ -165,8 +175,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W64-NEXT: s_cbranch_execz .LBB0_2 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX11W64-NEXT: s_cmov_b64 exec, vcc +; GFX11W64-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 
s[8:11], s[2:3], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -175,8 +187,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc -; GFX11W64-NEXT: .LBB0_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11W64-NEXT: .LBB0_2: ; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -196,8 +208,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W32-NEXT: s_cbranch_execz .LBB0_2 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX11W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11W32-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s1, s1 @@ -206,8 +220,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc -; GFX11W32-NEXT: .LBB0_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11W32-NEXT: .LBB0_2: ; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 @@ -228,8 +242,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W64-NEXT: s_cbranch_execz .LBB0_2 +; GFX12W64-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX12W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX12W64-NEXT: s_cmov_b64 exec, vcc +; GFX12W64-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX12W64-NEXT: ; %bb.1: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -238,8 +254,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: .LBB0_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12W64-NEXT: .LBB0_2: ; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -259,8 +275,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W32-NEXT: s_cbranch_execz .LBB0_2 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX12W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12W32-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 @@ -269,8 +287,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: .LBB0_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12W32-NEXT: .LBB0_2: ; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 @@ -292,13 +310,15 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-LABEL: add_i32_uniform: ; GFX6: ; %bb.0: ; 
%entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: s_load_dword s6, s[2:3], 0x11 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX6-NEXT: s_cbranch_execz .LBB1_2 +; GFX6-NEXT: s_cmov_b64 exec, vcc +; GFX6-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX6-NEXT: ; %bb.1: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -306,8 +326,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-NEXT: s_mul_i32 s4, s6, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc -; GFX6-NEXT: .LBB1_2: ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: .LBB1_2: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -321,14 +341,16 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: add_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_cbranch_execz .LBB1_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -336,8 +358,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX8-NEXT: s_mul_i32 s4, s6, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: 
buffer_atomic_add v1, off, s[8:11], 0 glc -; GFX8-NEXT: .LBB1_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: .LBB1_2: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -351,14 +373,16 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -366,8 +390,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: s_mul_i32 s4, s6, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc -; GFX9-NEXT: .LBB1_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: .LBB1_2: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -382,12 +406,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64: ; %bb.0: ; %entry ; GFX10W64-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec -; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: s_mov_b64 s[0:1], exec ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; 
GFX10W64-NEXT: s_cbranch_execz .LBB1_2 +; GFX10W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX10W64-NEXT: s_cmov_b64 exec, vcc +; GFX10W64-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -395,9 +421,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64-NEXT: s_mul_i32 s4, s6, s4 ; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc -; GFX10W64-NEXT: .LBB1_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10W64-NEXT: .LBB1_2: ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) ; GFX10W64-NEXT: s_mov_b32 null, 0 @@ -412,11 +438,13 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W32: ; %bb.0: ; %entry ; GFX10W32-NEXT: s_load_dword s0, s[2:3], 0x44 ; GFX10W32-NEXT: s_mov_b32 s4, exec_lo -; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: s_mov_b32 s1, exec_lo ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB1_2 +; GFX10W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10W32-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX10W32-NEXT: ; %bb.1: ; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -424,9 +452,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W32-NEXT: s_mul_i32 s4, s0, s4 ; GFX10W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W32-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc -; GFX10W32-NEXT: .LBB1_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10W32-NEXT: .LBB1_2: ; GFX10W32-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; 
GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: s_mov_b32 null, 0 @@ -446,8 +474,10 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W64-NEXT: s_cbranch_execz .LBB1_2 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX11W64-NEXT: s_cmov_b64 exec, vcc +; GFX11W64-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -456,8 +486,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc -; GFX11W64-NEXT: .LBB1_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11W64-NEXT: .LBB1_2: ; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -478,8 +508,10 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W32-NEXT: s_cbranch_execz .LBB1_2 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX11W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11W32-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -488,8 +520,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W32-NEXT: v_mov_b32_e32 v1, s4 
; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc -; GFX11W32-NEXT: .LBB1_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11W32-NEXT: .LBB1_2: ; GFX11W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1 @@ -511,8 +543,10 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W64-NEXT: s_cbranch_execz .LBB1_2 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX12W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX12W64-NEXT: s_cmov_b64 exec, vcc +; GFX12W64-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX12W64-NEXT: ; %bb.1: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -521,8 +555,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: .LBB1_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12W64-NEXT: .LBB1_2: ; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -543,8 +577,10 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W32-NEXT: s_cbranch_execz .LBB1_2 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX12W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12W32-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], 
s[2:3], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -553,8 +589,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: .LBB1_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12W32-NEXT: .LBB1_2: ; GFX12W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1 @@ -594,17 +630,18 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX6-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX6-NEXT: ; implicit-def: $vgpr0 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX6-NEXT: s_cbranch_execz .LBB2_4 +; GFX6-NEXT: s_cmov_b64 exec, vcc +; GFX6-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX6-NEXT: ; %bb.3: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc -; GFX6-NEXT: .LBB2_4: ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: .LBB2_4: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -636,17 +673,18 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8-NEXT: s_cbranch_execz .LBB2_4 +; 
GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc -; GFX8-NEXT: .LBB2_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: .LBB2_4: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -677,17 +715,18 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB2_4 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc -; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -717,17 +756,18 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX10W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX10W64-NEXT: s_cbranch_execz .LBB2_4 +; GFX10W64-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX10W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX10W64-NEXT: s_cmov_b64 exec, vcc +; GFX10W64-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX10W64-NEXT: ; %bb.3: ; 
GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc -; GFX10W64-NEXT: .LBB2_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10W64-NEXT: .LBB2_4: ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) ; GFX10W64-NEXT: s_mov_b32 null, 0 @@ -757,17 +797,18 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 -; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10W32-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX10W32-NEXT: s_cbranch_execz .LBB2_4 +; GFX10W32-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX10W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10W32-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX10W32-NEXT: ; %bb.3: ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; GFX10W32-NEXT: v_mov_b32_e32 v0, s0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc -; GFX10W32-NEXT: .LBB2_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10W32-NEXT: .LBB2_4: ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: s_mov_b32 null, 0 @@ -802,17 +843,17 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 -; GFX11W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX11W64-NEXT: s_cbranch_execz .LBB2_4 +; GFX11W64-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX11W64-NEXT: s_cmp_lg_u64 vcc, 0 +; 
GFX11W64-NEXT: s_cmov_b64 exec, vcc +; GFX11W64-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX11W64-NEXT: ; %bb.3: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc -; GFX11W64-NEXT: .LBB2_4: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11W64-NEXT: .LBB2_4: ; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -845,19 +886,20 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 -; GFX11W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX11W32-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX11W32-NEXT: s_cbranch_execz .LBB2_4 +; GFX11W32-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX11W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX11W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11W32-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX11W32-NEXT: ; %bb.3: ; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 ; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc -; GFX11W32-NEXT: .LBB2_4: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11W32-NEXT: .LBB2_4: ; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 @@ -893,17 +935,17 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: 
s_and_saveexec_b64 s[0:1], vcc -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX12W64-NEXT: s_cbranch_execz .LBB2_4 +; GFX12W64-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX12W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX12W64-NEXT: s_cmov_b64 exec, vcc +; GFX12W64-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: .LBB2_4: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12W64-NEXT: .LBB2_4: ; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -936,19 +978,20 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 -; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX12W32-NEXT: s_cbranch_execz .LBB2_4 +; GFX12W32-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX12W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX12W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12W32-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX12W32-NEXT: ; %bb.3: ; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 ; GFX12W32-NEXT: v_mov_b32_e32 v1, s0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: .LBB2_4: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12W32-NEXT: .LBB2_4: ; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; 
GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 @@ -988,10 +1031,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX6-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX6-NEXT: ; implicit-def: $vgpr0 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX6-NEXT: s_cbranch_execz .LBB3_4 +; GFX6-NEXT: s_cmov_b64 exec, vcc +; GFX6-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX6-NEXT: ; %bb.3: ; GFX6-NEXT: s_load_dword s5, s[2:3], 0x11 ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd @@ -999,8 +1043,8 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc -; GFX6-NEXT: .LBB3_4: ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: .LBB3_4: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -1032,10 +1076,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8-NEXT: s_cbranch_execz .LBB3_4 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_load_dword s5, s[2:3], 0x44 ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 @@ -1043,8 +1088,8 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s5 ; 
GFX8-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc -; GFX8-NEXT: .LBB3_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: .LBB3_4: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -1075,10 +1120,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB3_4 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dword s5, s[2:3], 0x44 ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 @@ -1086,8 +1132,8 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc -; GFX9-NEXT: .LBB3_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: .LBB3_4: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -1117,9 +1163,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX10W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX10W64-NEXT: s_cbranch_execz .LBB3_4 +; GFX10W64-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX10W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX10W64-NEXT: s_cmov_b64 exec, vcc +; GFX10W64-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX10W64-NEXT: ; %bb.3: ; GFX10W64-NEXT: s_clause 0x1 ; GFX10W64-NEXT: 
s_load_dword s5, s[2:3], 0x44 @@ -1128,9 +1175,9 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: v_mov_b32_e32 v2, s5 ; GFX10W64-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc -; GFX10W64-NEXT: .LBB3_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10W64-NEXT: .LBB3_4: ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) ; GFX10W64-NEXT: s_mov_b32 null, 0 @@ -1160,9 +1207,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 -; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10W32-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX10W32-NEXT: s_cbranch_execz .LBB3_4 +; GFX10W32-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX10W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10W32-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX10W32-NEXT: ; %bb.3: ; GFX10W32-NEXT: s_clause 0x1 ; GFX10W32-NEXT: s_load_dword s8, s[2:3], 0x44 @@ -1171,9 +1219,9 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: v_mov_b32_e32 v2, s8 ; GFX10W32-NEXT: buffer_atomic_add v0, v2, s[4:7], 0 idxen glc -; GFX10W32-NEXT: .LBB3_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10W32-NEXT: .LBB3_4: ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: s_mov_b32 null, 0 @@ -1208,10 +1256,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 -; GFX11W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; 
GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX11W64-NEXT: s_cbranch_execz .LBB3_4 +; GFX11W64-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX11W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX11W64-NEXT: s_cmov_b64 exec, vcc +; GFX11W64-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX11W64-NEXT: ; %bb.3: ; GFX11W64-NEXT: s_clause 0x1 ; GFX11W64-NEXT: s_load_b32 s5, s[2:3], 0x44 @@ -1220,8 +1268,8 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: v_mov_b32_e32 v2, s5 ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc -; GFX11W64-NEXT: .LBB3_4: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11W64-NEXT: .LBB3_4: ; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -1254,12 +1302,13 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 -; GFX11W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX11W32-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX11W32-NEXT: s_cbranch_execz .LBB3_4 +; GFX11W32-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX11W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX11W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11W32-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX11W32-NEXT: ; %bb.3: ; GFX11W32-NEXT: s_clause 0x1 ; GFX11W32-NEXT: s_load_b32 s8, s[2:3], 0x44 @@ -1267,8 +1316,8 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s8 ; GFX11W32-NEXT: 
buffer_atomic_add_u32 v1, v2, s[4:7], 0 idxen glc -; GFX11W32-NEXT: .LBB3_4: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11W32-NEXT: .LBB3_4: ; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 @@ -1304,10 +1353,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX12W64-NEXT: s_cbranch_execz .LBB3_4 +; GFX12W64-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX12W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX12W64-NEXT: s_cmov_b64 exec, vcc +; GFX12W64-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_clause 0x1 ; GFX12W64-NEXT: s_load_b32 s5, s[2:3], 0x44 @@ -1316,8 +1365,8 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: v_mov_b32_e32 v2, s5 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: .LBB3_4: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12W64-NEXT: .LBB3_4: ; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -1350,12 +1399,13 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W32-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 -; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; 
GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX12W32-NEXT: s_cbranch_execz .LBB3_4 +; GFX12W32-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX12W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX12W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12W32-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX12W32-NEXT: ; %bb.3: ; GFX12W32-NEXT: s_clause 0x1 ; GFX12W32-NEXT: s_load_b32 s8, s[2:3], 0x44 @@ -1363,8 +1413,8 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s8 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: .LBB3_4: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12W32-NEXT: .LBB3_4: ; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 @@ -1506,12 +1556,14 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-LABEL: sub_i32_constant: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX6-NEXT: s_cbranch_execz .LBB5_2 +; GFX6-NEXT: s_cmov_b64 exec, vcc +; GFX6-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX6-NEXT: ; %bb.1: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1519,8 +1571,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc -; GFX6-NEXT: .LBB5_2: ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: .LBB5_2: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: 
s_mov_b32 s2, -1 @@ -1538,9 +1590,11 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_cbranch_execz .LBB5_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1548,8 +1602,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc -; GFX8-NEXT: .LBB5_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: .LBB5_2: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 @@ -1567,9 +1621,11 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1577,8 +1633,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc -; GFX9-NEXT: .LBB5_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: .LBB5_2: ; GFX9-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 @@ -1592,12 +1648,14 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-LABEL: sub_i32_constant: ; GFX10W64: ; %bb.0: ; %entry ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec -; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: s_mov_b64 s[0:1], exec ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB5_2 +; GFX10W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX10W64-NEXT: s_cmov_b64 exec, vcc +; GFX10W64-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1605,9 +1663,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc -; GFX10W64-NEXT: .LBB5_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10W64-NEXT: .LBB5_2: ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) ; GFX10W64-NEXT: s_mov_b32 null, 0 @@ -1622,11 +1680,13 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W32-LABEL: sub_i32_constant: ; GFX10W32: ; %bb.0: ; %entry ; GFX10W32-NEXT: s_mov_b32 s1, exec_lo -; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: s_mov_b32 s0, exec_lo ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB5_2 +; GFX10W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10W32-NEXT: s_cmov_b32 exec_lo, 
vcc_lo +; GFX10W32-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX10W32-NEXT: ; %bb.1: ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s1, s1 @@ -1634,9 +1694,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[4:7], 0 glc -; GFX10W32-NEXT: .LBB5_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10W32-NEXT: .LBB5_2: ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: s_mov_b32 null, 0 @@ -1656,8 +1716,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W64-NEXT: s_cbranch_execz .LBB5_2 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX11W64-NEXT: s_cmov_b64 exec, vcc +; GFX11W64-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1666,8 +1728,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc -; GFX11W64-NEXT: .LBB5_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11W64-NEXT: .LBB5_2: ; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -1688,8 +1750,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: 
s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W32-NEXT: s_cbranch_execz .LBB5_2 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX11W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11W32-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s1, s1 @@ -1698,8 +1762,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], 0 glc -; GFX11W32-NEXT: .LBB5_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11W32-NEXT: .LBB5_2: ; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 @@ -1721,8 +1785,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W64-NEXT: s_cbranch_execz .LBB5_2 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX12W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX12W64-NEXT: s_cmov_b64 exec, vcc +; GFX12W64-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX12W64-NEXT: ; %bb.1: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1731,8 +1797,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: .LBB5_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12W64-NEXT: .LBB5_2: ; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: 
v_readfirstlane_b32 s2, v1 @@ -1753,8 +1819,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W32-NEXT: s_cbranch_execz .LBB5_2 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX12W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12W32-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 @@ -1763,8 +1831,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: .LBB5_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12W32-NEXT: .LBB5_2: ; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 @@ -1787,13 +1855,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-LABEL: sub_i32_uniform: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: s_load_dword s6, s[2:3], 0x11 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX6-NEXT: s_cbranch_execz .LBB6_2 +; GFX6-NEXT: s_cmov_b64 exec, vcc +; GFX6-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX6-NEXT: ; %bb.1: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1801,8 +1871,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-NEXT: 
s_mul_i32 s4, s6, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc -; GFX6-NEXT: .LBB6_2: ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: .LBB6_2: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -1816,14 +1886,16 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: sub_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_cbranch_execz .LBB6_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1831,8 +1903,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX8-NEXT: s_mul_i32 s4, s6, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc -; GFX8-NEXT: .LBB6_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: .LBB6_2: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -1846,14 +1918,16 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: 
s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1861,8 +1935,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: s_mul_i32 s4, s6, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc -; GFX9-NEXT: .LBB6_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: .LBB6_2: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -1877,12 +1951,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64: ; %bb.0: ; %entry ; GFX10W64-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec -; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: s_mov_b64 s[0:1], exec ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB6_2 +; GFX10W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX10W64-NEXT: s_cmov_b64 exec, vcc +; GFX10W64-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1890,9 +1966,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64-NEXT: s_mul_i32 s4, s6, s4 ; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc -; GFX10W64-NEXT: .LBB6_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10W64-NEXT: .LBB6_2: ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x24 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -1907,11 +1983,13 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W32: ; %bb.0: ; %entry ; GFX10W32-NEXT: s_load_dword s0, s[2:3], 0x44 ; GFX10W32-NEXT: s_mov_b32 s4, exec_lo -; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: s_mov_b32 s1, exec_lo ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB6_2 +; GFX10W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10W32-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX10W32-NEXT: ; %bb.1: ; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -1919,9 +1997,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W32-NEXT: s_mul_i32 s4, s0, s4 ; GFX10W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc -; GFX10W32-NEXT: .LBB6_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10W32-NEXT: .LBB6_2: ; GFX10W32-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: v_mul_lo_u32 v0, s0, v0 @@ -1941,8 +2019,10 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W64-NEXT: s_cbranch_execz .LBB6_2 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX11W64-NEXT: s_cmov_b64 exec, vcc +; GFX11W64-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX11W64-NEXT: 
s_bcnt1_i32_b64 s4, s[4:5] @@ -1951,8 +2031,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc -; GFX11W64-NEXT: .LBB6_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11W64-NEXT: .LBB6_2: ; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -1974,8 +2054,10 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W32-NEXT: s_cbranch_execz .LBB6_2 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX11W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11W32-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -1984,8 +2066,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc -; GFX11W32-NEXT: .LBB6_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11W32-NEXT: .LBB6_2: ; GFX11W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: v_mul_lo_u32 v0, s0, v0 @@ -2008,8 +2090,10 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W64-NEXT: s_cbranch_execz .LBB6_2 +; GFX12W64-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX12W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX12W64-NEXT: s_cmov_b64 exec, vcc +; GFX12W64-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX12W64-NEXT: ; %bb.1: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -2018,8 +2102,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: .LBB6_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12W64-NEXT: .LBB6_2: ; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -2041,8 +2125,10 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W32-NEXT: s_cbranch_execz .LBB6_2 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX12W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12W32-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -2051,8 +2137,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: .LBB6_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12W32-NEXT: .LBB6_2: ; GFX12W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: v_mul_lo_u32 v0, s0, v0 @@ -2093,17 +2179,18 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX6-NEXT: 
v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX6-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX6-NEXT: ; implicit-def: $vgpr0 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX6-NEXT: s_cbranch_execz .LBB7_4 +; GFX6-NEXT: s_cmov_b64 exec, vcc +; GFX6-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX6-NEXT: ; %bb.3: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc -; GFX6-NEXT: .LBB7_4: ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: .LBB7_4: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -2135,17 +2222,18 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8-NEXT: s_cbranch_execz .LBB7_4 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc -; GFX8-NEXT: .LBB7_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: .LBB7_4: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -2176,17 +2264,18 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; 
GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB7_4 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc -; GFX9-NEXT: .LBB7_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: .LBB7_4: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -2216,17 +2305,18 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX10W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX10W64-NEXT: s_cbranch_execz .LBB7_4 +; GFX10W64-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX10W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX10W64-NEXT: s_cmov_b64 exec, vcc +; GFX10W64-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX10W64-NEXT: ; %bb.3: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc -; GFX10W64-NEXT: .LBB7_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10W64-NEXT: .LBB7_4: ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) ; GFX10W64-NEXT: s_mov_b32 null, 0 @@ -2256,17 +2346,18 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: 
; implicit-def: $vgpr0 -; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10W32-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX10W32-NEXT: s_cbranch_execz .LBB7_4 +; GFX10W32-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX10W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10W32-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX10W32-NEXT: ; %bb.3: ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; GFX10W32-NEXT: v_mov_b32_e32 v0, s0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc -; GFX10W32-NEXT: .LBB7_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10W32-NEXT: .LBB7_4: ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: s_mov_b32 null, 0 @@ -2301,17 +2392,17 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 -; GFX11W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX11W64-NEXT: s_cbranch_execz .LBB7_4 +; GFX11W64-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX11W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX11W64-NEXT: s_cmov_b64 exec, vcc +; GFX11W64-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX11W64-NEXT: ; %bb.3: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc -; GFX11W64-NEXT: .LBB7_4: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11W64-NEXT: .LBB7_4: ; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -2344,19 +2435,20 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1 ; 
GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 -; GFX11W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX11W32-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX11W32-NEXT: s_cbranch_execz .LBB7_4 +; GFX11W32-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX11W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX11W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11W32-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX11W32-NEXT: ; %bb.3: ; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 ; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], 0 glc -; GFX11W32-NEXT: .LBB7_4: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11W32-NEXT: .LBB7_4: ; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 @@ -2393,17 +2485,17 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX12W64-NEXT: s_cbranch_execz .LBB7_4 +; GFX12W64-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX12W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX12W64-NEXT: s_cmov_b64 exec, vcc +; GFX12W64-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: .LBB7_4: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12W64-NEXT: 
.LBB7_4: ; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -2436,19 +2528,20 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 -; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX12W32-NEXT: s_cbranch_execz .LBB7_4 +; GFX12W32-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX12W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX12W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12W32-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX12W32-NEXT: ; %bb.3: ; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 ; GFX12W32-NEXT: v_mov_b32_e32 v1, s0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: .LBB7_4: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12W32-NEXT: .LBB7_4: ; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 16f3ff4be6b501..3d974167b63ebc 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -26,13 +26,15 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX7LESS-LABEL: add_i32_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec +; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec ; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB0_2 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] @@ -45,8 +47,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX7LESS-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 -; GFX7LESS-NEXT: .LBB0_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: .LBB0_2: ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 @@ -57,14 +59,16 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: add_i32_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB0_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s8, s2 @@ -77,8 +81,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: .LBB0_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 
+; GFX8-NEXT: .LBB0_2: ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s3, 0xf000 @@ -89,14 +93,16 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: add_i32_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[8:9], exec +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB0_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[8:9] ; GFX9-NEXT: s_mul_i32 s2, s2, 5 @@ -109,8 +115,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: .LBB0_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: .LBB0_2: ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s7, 0xf000 @@ -123,12 +129,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1064-NEXT: s_mov_b64 s[8:9], exec -; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB0_2 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: 
s_cbranch_scc0 .LBB0_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[8:9] ; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 @@ -142,9 +150,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB0_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: .LBB0_2: ; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 @@ -157,11 +165,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1032-NEXT: s_mov_b32 s1, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB0_2 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 @@ -175,9 +185,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB0_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: .LBB0_2: ; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 @@ -195,8 +205,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB0_2 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 @@ -210,8 +222,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB0_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB0_2: ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 @@ -231,8 +243,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB0_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 @@ -246,8 +260,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl1_inv ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB0_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: .LBB0_2: ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 @@ -268,8 +282,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: ; implicit-def: 
$vgpr1 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 -; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1264-NEXT: s_cbranch_execz .LBB0_2 +; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1264-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1264-NEXT: s_cmov_b64 exec, vcc +; GFX1264-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1264-NEXT: ; %bb.1: ; GFX1264-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 @@ -283,8 +299,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV -; GFX1264-NEXT: .LBB0_2: ; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1264-NEXT: .LBB0_2: ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 @@ -304,8 +320,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 ; GFX1232-NEXT: ; implicit-def: $vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1232-NEXT: s_cbranch_execz .LBB0_2 +; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1232-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1232-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1232-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1232-NEXT: ; %bb.1: ; GFX1232-NEXT: s_bcnt1_i32_b32 s5, s5 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 @@ -319,8 +337,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV -; GFX1232-NEXT: .LBB0_2: ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1232-NEXT: .LBB0_2: ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: 
v_readfirstlane_b32 s2, v1 ; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 @@ -341,14 +359,16 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS-LABEL: add_i32_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GFX7LESS-NEXT: s_load_dword s2, s[2:3], 0xd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s9, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_b64 s[10:11], vcc, -1 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB1_2 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s3, s[8:9] @@ -361,8 +381,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 -; GFX7LESS-NEXT: .LBB1_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: .LBB1_2: ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 @@ -374,31 +394,33 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: add_i32_uniform: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_mov_b64 s[8:9], exec ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dword s8, s[2:3], 0x34 -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX8-NEXT: s_load_dword s2, s[2:3], 0x34 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], exec +; 
GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB1_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX8-NEXT: s_bcnt1_i32_b64 s3, s[8:9] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mul_i32 s0, s8, s0 +; GFX8-NEXT: s_mul_i32 s3, s2, s3 ; GFX8-NEXT: s_mov_b32 s15, 0xf000 ; GFX8-NEXT: s_mov_b32 s14, -1 ; GFX8-NEXT: s_mov_b32 s12, s6 ; GFX8-NEXT: s_mov_b32 s13, s7 -; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8-NEXT: .LBB1_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX8-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v1 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 @@ -408,31 +430,33 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_mov_b64 s[8:9], exec ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-NEXT: s_load_dword s10, s[2:3], 0x34 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-NEXT: 
s_bcnt1_i32_b64 s2, s[8:9] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s0, s8, s0 +; GFX9-NEXT: s_mul_i32 s2, s10, s2 ; GFX9-NEXT: s_mov_b32 s15, 0xf000 ; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: .LBB1_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, s10, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -446,12 +470,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1064-NEXT: s_load_dword s10, s[2:3], 0x34 ; GFX1064-NEXT: s_mov_b64 s[8:9], exec -; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB1_2 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[8:9] ; GFX1064-NEXT: s_mov_b32 s15, 0x31016000 @@ -465,9 +491,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB1_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: .LBB1_2: ; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX1064-NEXT: s_mov_b32 s7, 0x31016000 @@ -482,11 +508,13 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX1032-NEXT: s_mov_b32 s8, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: s_mov_b32 s1, exec_lo ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB1_2 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s8 ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 @@ -500,9 +528,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB1_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: .LBB1_2: ; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 @@ -522,8 +550,10 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB1_2 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s3, s[8:9] ; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 @@ -537,8 +567,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; 
GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB1_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: .LBB1_2: ; GFX1164-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 @@ -560,8 +590,10 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB1_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s8 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 @@ -575,8 +607,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl1_inv ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB1_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132-NEXT: .LBB1_2: ; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 @@ -599,8 +631,10 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: ; implicit-def: $vgpr1 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 -; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1264-NEXT: s_cbranch_execz .LBB1_2 +; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1264-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1264-NEXT: s_cmov_b64 exec, vcc +; GFX1264-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX1264-NEXT: ; %bb.1: ; GFX1264-NEXT: s_bcnt1_i32_b64 s3, s[8:9] ; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 @@ -614,8 +648,8 @@ define amdgpu_kernel void 
@add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV -; GFX1264-NEXT: .LBB1_2: ; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1264-NEXT: .LBB1_2: ; GFX1264-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: s_mov_b32 s7, 0x31016000 @@ -637,8 +671,10 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1232-NEXT: ; implicit-def: $vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1232-NEXT: s_cbranch_execz .LBB1_2 +; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1232-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1232-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1232-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX1232-NEXT: ; %bb.1: ; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s8 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 @@ -652,8 +688,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV -; GFX1232-NEXT: .LBB1_2: ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1232-NEXT: .LBB1_2: ; GFX1232-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mov_b32 s7, 0x31016000 @@ -693,10 +729,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; 
GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX7LESS_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s10, -1 @@ -707,8 +744,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS_ITERATIVE-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: buffer_wbinvl1 -; GFX7LESS_ITERATIVE-NEXT: .LBB2_4: ; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: .LBB2_4: ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -739,10 +776,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX8_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX8_ITERATIVE-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s10, -1 @@ -753,8 +791,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX8_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX8_ITERATIVE-NEXT: buffer_wbinvl1_vol -; GFX8_ITERATIVE-NEXT: .LBB2_4: ; 
GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_ITERATIVE-NEXT: .LBB2_4: ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 @@ -784,10 +822,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX9_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX9_ITERATIVE-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: s_mov_b32 s15, 0xf000 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s14, -1 @@ -798,8 +837,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-NEXT: buffer_atomic_add v0, off, s[12:15], 0 glc ; GFX9_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX9_ITERATIVE-NEXT: buffer_wbinvl1_vol -; GFX9_ITERATIVE-NEXT: .LBB2_4: ; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_ITERATIVE-NEXT: .LBB2_4: ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_ITERATIVE-NEXT: s_mov_b32 s7, 0xf000 @@ -829,9 +868,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; 
GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, s8 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 @@ -843,9 +883,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1064_ITERATIVE-NEXT: .LBB2_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: .LBB2_4: ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s7, 0x31016000 @@ -874,9 +914,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 @@ -888,9 +929,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1032_ITERATIVE-NEXT: .LBB2_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; 
GFX1032_ITERATIVE-NEXT: .LBB2_4: ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s7, 0x31016000 @@ -924,10 +965,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, s6 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 @@ -939,8 +980,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1164_ITERATIVE-NEXT: .LBB2_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: .LBB2_4: ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 @@ -973,12 +1014,13 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 
vcc_lo, 0, v1 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s5, exec_lo, s5 -; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s5, vcc_lo, exec_lo +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: ; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v1, s4 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 @@ -990,8 +1032,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1132_ITERATIVE-NEXT: .LBB2_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1132_ITERATIVE-NEXT: .LBB2_4: ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 @@ -1028,10 +1070,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX1264_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1264_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1264_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1264_ITERATIVE-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX1264_ITERATIVE-NEXT: ; %bb.3: ; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v1, s6 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 @@ -1043,8 +1085,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr 
addrspace( ; GFX1264_ITERATIVE-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264_ITERATIVE-NEXT: s_wait_loadcnt 0x0 ; GFX1264_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV -; GFX1264_ITERATIVE-NEXT: .LBB2_4: ; GFX1264_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1264_ITERATIVE-NEXT: .LBB2_4: ; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 @@ -1077,12 +1119,13 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1232_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1232_ITERATIVE-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1232_ITERATIVE-NEXT: s_xor_b32 s5, exec_lo, s5 -; GFX1232_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX1232_ITERATIVE-NEXT: s_xor_b32 s5, vcc_lo, exec_lo +; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1232_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1232_ITERATIVE-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX1232_ITERATIVE-NEXT: ; %bb.3: ; GFX1232_ITERATIVE-NEXT: v_mov_b32_e32 v1, s4 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 @@ -1094,8 +1137,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0 ; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV -; GFX1232_ITERATIVE-NEXT: .LBB2_4: ; GFX1232_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1232_ITERATIVE-NEXT: .LBB2_4: ; GFX1232_ITERATIVE-NEXT: 
s_wait_kmcnt 0x0 ; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 @@ -1128,16 +1171,17 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_DPP-LABEL: add_i32_varying: ; GFX8_DPP: ; %bb.0: ; %entry ; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8_DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -1149,15 +1193,17 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_readlane_b32 s6, v2, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s8, v2, 63 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX8_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8_DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX8_DPP-NEXT: s_cmov_b64 exec, vcc +; 
GFX8_DPP-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: s_mov_b32 s6, s8 ; GFX8_DPP-NEXT: s_mov_b32 s11, 0xf000 ; GFX8_DPP-NEXT: s_mov_b32 s10, -1 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1167,8 +1213,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_DPP-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX8_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX8_DPP-NEXT: buffer_wbinvl1_vol -; GFX8_DPP-NEXT: .LBB2_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_DPP-NEXT: .LBB2_2: ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1181,16 +1227,17 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-LABEL: add_i32_varying: ; GFX9_DPP: ; %bb.0: ; %entry ; GFX9_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -1202,15 +1249,17 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; 
GFX9_DPP-NEXT: v_readlane_b32 s2, v2, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s8, v2, 63 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX9_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9_DPP-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: s_mov_b32 s2, s8 ; GFX9_DPP-NEXT: s_mov_b32 s11, 0xf000 ; GFX9_DPP-NEXT: s_mov_b32 s10, -1 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1220,8 +1269,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX9_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9_DPP-NEXT: buffer_wbinvl1_vol -; GFX9_DPP-NEXT: .LBB2_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: .LBB2_2: ; GFX9_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1233,6 +1282,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1064_DPP-LABEL: add_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: s_mov_b64 s[8:9], exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -1249,30 +1299,31 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s10, v1, 15 ; 
GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s8, 16 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s10, 16 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s9, v1, 63 ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 32 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 47 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064_DPP-NEXT: s_mov_b32 s0, s9 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s8, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s3, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b32 s0, s2 ; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 +; GFX1064_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX1064_DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX1064_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064_DPP-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX1064_DPP-NEXT: ; %bb.1: ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s0 ; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 @@ -1283,9 +1334,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl1_inv ; GFX1064_DPP-NEXT: buffer_gl0_inv -; GFX1064_DPP-NEXT: .LBB2_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; 
GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1064_DPP-NEXT: .LBB2_2: ; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1297,6 +1348,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1032_DPP-LABEL: add_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: s_mov_b32 s8, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -1311,22 +1363,25 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_readlane_b32 s2, v1, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: s_mov_b32 s0, s2 ; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 +; GFX1032_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo -; GFX1032_DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX1032_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; 
GFX1032_DPP-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX1032_DPP-NEXT: ; %bb.1: ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s0 ; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 @@ -1337,9 +1392,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl1_inv ; GFX1032_DPP-NEXT: buffer_gl0_inv -; GFX1032_DPP-NEXT: .LBB2_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1032_DPP-NEXT: .LBB2_2: ; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1352,6 +1407,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-LABEL: add_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_mov_b64 s[8:9], exec ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec @@ -1378,30 +1434,29 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s9, v1, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 ; GFX1164_DPP-NEXT: 
v_writelane_b32 v3, s7, 32 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 63 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1164_DPP-NEXT: s_mov_b32 s4, s9 -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s8, 48 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX1164_DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX1164_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164_DPP-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s7 ; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: s_mov_b32 s4, s2 @@ -1410,8 +1465,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl1_inv ; GFX1164_DPP-NEXT: buffer_gl0_inv -; GFX1164_DPP-NEXT: .LBB2_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1164_DPP-NEXT: .LBB2_2: ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -1427,6 +1482,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-LABEL: add_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s8, exec_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 ; 
GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo @@ -1440,30 +1496,32 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; 
GFX1132_DPP-NEXT: s_mov_b32 s4, s6 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1132_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo -; GFX1132_DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX1132_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX1132_DPP-NEXT: ; %bb.1: ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s4 ; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 @@ -1474,8 +1532,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl1_inv ; GFX1132_DPP-NEXT: buffer_gl0_inv -; GFX1132_DPP-NEXT: .LBB2_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1132_DPP-NEXT: .LBB2_2: ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -1491,6 +1549,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-LABEL: add_i32_varying: ; GFX1264_DPP: ; %bb.0: ; %entry ; GFX1264_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1264_DPP-NEXT: s_mov_b64 s[8:9], exec ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1264_DPP-NEXT: s_not_b64 exec, exec @@ -1517,30 +1576,29 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1264_DPP-NEXT: v_readlane_b32 s7, v1, 31 ; GFX1264_DPP-NEXT: v_writelane_b32 v3, s6, 16 ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1264_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; 
GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1264_DPP-NEXT: v_readlane_b32 s8, v1, 47 -; GFX1264_DPP-NEXT: v_readlane_b32 s9, v1, 63 +; GFX1264_DPP-NEXT: v_readlane_b32 s6, v1, 47 ; GFX1264_DPP-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1264_DPP-NEXT: v_readlane_b32 s7, v1, 63 ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] -; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1264_DPP-NEXT: s_mov_b32 s4, s9 -; GFX1264_DPP-NEXT: v_writelane_b32 v3, s8, 48 -; GFX1264_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX1264_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1264_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1264_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1264_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1264_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1264_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX1264_DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX1264_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1264_DPP-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX1264_DPP-NEXT: ; %bb.1: -; GFX1264_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v0, s7 ; GFX1264_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 ; GFX1264_DPP-NEXT: s_mov_b32 s4, s2 @@ -1549,8 +1607,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0 ; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV -; GFX1264_DPP-NEXT: .LBB2_2: ; GFX1264_DPP-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1264_DPP-NEXT: .LBB2_2: ; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 ; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -1566,6 +1624,7 @@ define amdgpu_kernel void @add_i32_varying(ptr 
addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-LABEL: add_i32_varying: ; GFX1232_DPP: ; %bb.0: ; %entry ; GFX1232_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1232_DPP-NEXT: s_mov_b32 s8, exec_lo ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo @@ -1579,30 +1638,32 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1232_DPP-NEXT: v_readlane_b32 s6, v1, 31 ; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1232_DPP-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1232_DPP-NEXT: s_mov_b32 s4, s6 ; GFX1232_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1232_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1232_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1232_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo -; GFX1232_DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX1232_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1232_DPP-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX1232_DPP-NEXT: ; %bb.1: ; GFX1232_DPP-NEXT: v_mov_b32_e32 v0, s4 ; GFX1232_DPP-NEXT: s_mov_b32 s7, 0x31016000 @@ -1613,8 +1674,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232_DPP-NEXT: s_wait_loadcnt 0x0 ; GFX1232_DPP-NEXT: global_inv scope:SCOPE_DEV -; GFX1232_DPP-NEXT: .LBB2_2: ; GFX1232_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1232_DPP-NEXT: .LBB2_2: ; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0 ; GFX1232_DPP-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -1637,13 +1698,15 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX7LESS-LABEL: add_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec +; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec ; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7LESS-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB3_2 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: 
s_cbranch_scc0 .LBB3_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] @@ -1657,8 +1720,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX7LESS-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 -; GFX7LESS-NEXT: .LBB3_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: .LBB3_2: ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 @@ -1675,14 +1738,16 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: add_i64_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB3_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s8, s2 @@ -1696,8 +1761,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: .LBB3_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: .LBB3_2: ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_readfirstlane_b32 s3, v0 @@ -1712,14 +1777,16 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: add_i64_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 
0x24 ; GFX9-NEXT: s_mov_b64 s[8:9], exec +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB3_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[8:9] ; GFX9-NEXT: s_mul_i32 s2, s2, 5 @@ -1733,8 +1800,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: .LBB3_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: .LBB3_2: ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: v_readfirstlane_b32 s1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 @@ -1750,12 +1817,14 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1064-NEXT: s_mov_b64 s[8:9], exec +; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB3_2 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[8:9] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 @@ -1770,9 +1839,9 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB3_2: ; 
GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: .LBB3_2: ; GFX1064-NEXT: v_readfirstlane_b32 s1, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -1786,11 +1855,13 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1032-NEXT: s_mov_b32 s1, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB3_2 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 @@ -1805,9 +1876,9 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB3_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: .LBB3_2: ; GFX1032-NEXT: v_readfirstlane_b32 s1, v1 ; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -1826,8 +1897,10 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB3_2 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX1164-NEXT: ; %bb.1: 
; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 @@ -1842,8 +1915,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB3_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB3_2: ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 @@ -1864,8 +1937,10 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB3_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 @@ -1879,8 +1954,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl1_inv ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB3_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: .LBB3_2: ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 @@ -1897,14 +1972,16 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264: ; %bb.0: ; %entry ; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1264-NEXT: s_mov_b64 s[6:7], exec -; GFX1264-NEXT: s_mov_b32 s9, 0 -; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1264-NEXT: s_mov_b64 s[4:5], exec +; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX1264-NEXT: s_mov_b32 s9, 0 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1264-NEXT: s_cbranch_execz .LBB3_2 +; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1264-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1264-NEXT: s_cmov_b64 exec, vcc +; GFX1264-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX1264-NEXT: ; %bb.1: ; GFX1264-NEXT: s_bcnt1_i32_b64 s8, s[6:7] ; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 @@ -1919,8 +1996,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV -; GFX1264-NEXT: .LBB3_2: ; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1264-NEXT: .LBB3_2: ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1264-NEXT: v_readfirstlane_b32 s2, v0 @@ -1937,13 +2014,15 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232: ; %bb.0: ; %entry ; GFX1232-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1232-NEXT: s_mov_b32 s4, exec_lo -; GFX1232-NEXT: s_mov_b32 s5, 0 -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s4, 0 ; GFX1232-NEXT: s_mov_b32 s6, exec_lo +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s4, 0 +; GFX1232-NEXT: s_mov_b32 s5, 0 ; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1232-NEXT: s_cbranch_execz .LBB3_2 +; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1232-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1232-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1232-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX1232-NEXT: ; %bb.1: ; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 @@ -1957,8 +2036,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: buffer_atomic_add_u64 v[0:1], 
off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV -; GFX1232-NEXT: .LBB3_2: ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX1232-NEXT: .LBB3_2: ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 @@ -1979,33 +2058,35 @@ entry: define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(1) %inout, i64 %additive) { ; GFX7LESS-LABEL: add_i64_uniform: ; GFX7LESS: ; %bb.0: ; %entry +; GFX7LESS-NEXT: s_mov_b64 s[10:11], exec ; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec ; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s9, v0 +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s10, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s11, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7LESS-NEXT: s_and_b64 s[2:3], vcc, -1 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB4_2 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[8:9] -; GFX7LESS-NEXT: s_mul_i32 s7, s1, s6 -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[10:11] +; GFX7LESS-NEXT: s_mul_i32 s3, s1, s2 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 ; GFX7LESS-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX7LESS-NEXT: s_mul_i32 s6, s0, s6 -; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 +; GFX7LESS-NEXT: s_mul_i32 s2, s0, s2 +; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s3, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 
v0, s2 ; GFX7LESS-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7LESS-NEXT: .LBB4_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 @@ -2024,31 +2105,33 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: add_i64_uniform: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_mov_b64 s[10:11], exec ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX8-NEXT: s_mov_b64 s[8:9], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s10, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s11, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: s_mov_b64 s[8:9], exec +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB4_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s12, s6 -; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[8:9] +; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[10:11] ; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s0, v0, 0 -; GFX8-NEXT: s_mul_i32 s6, s1, s6 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s0, v0, 0 +; GFX8-NEXT: s_mul_i32 s2, s1, s6 ; GFX8-NEXT: s_mov_b32 s15, 0xf000 ; GFX8-NEXT: s_mov_b32 s14, -1 ; GFX8-NEXT: s_mov_b32 s13, s7 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s2, v1 ; GFX8-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: .LBB4_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; 
GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_readfirstlane_b32 s3, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s3 @@ -2064,33 +2147,35 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: add_i64_uniform: ; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_mov_b64 s[10:11], exec ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_mov_b64 s[8:9], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s10, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s11, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_mov_b64 s[8:9], exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB4_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[10:11] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[8:9] -; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mul_i32 s7, s1, s6 -; GFX9-NEXT: s_mul_hi_u32 s8, s0, s6 -; GFX9-NEXT: s_add_i32 s8, s8, s7 -; GFX9-NEXT: s_mul_i32 s6, s0, s6 +; GFX9-NEXT: s_mul_i32 s3, s1, s2 +; GFX9-NEXT: s_mul_hi_u32 s6, s0, s2 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_mul_i32 s2, s0, s2 ; GFX9-NEXT: s_mov_b32 s15, 0xf000 ; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: .LBB4_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_readfirstlane_b32 s3, v0 ; GFX9-NEXT: 
v_mov_b32_e32 v0, s3 @@ -2108,33 +2193,35 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1064-NEXT: s_mov_b64 s[10:11], exec ; GFX1064-NEXT: s_mov_b64 s[8:9], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s10, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s11, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s8, s[8:9] -; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[10:11] +; GFX1064-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s9, s1, s8 -; GFX1064-NEXT: s_mul_hi_u32 s10, s0, s8 -; GFX1064-NEXT: s_mul_i32 s8, s0, s8 -; GFX1064-NEXT: s_add_i32 s10, s10, s9 -; GFX1064-NEXT: v_mov_b32_e32 v0, s8 +; GFX1064-NEXT: s_mul_i32 s3, s1, s2 +; GFX1064-NEXT: s_mul_hi_u32 s10, s0, s2 +; GFX1064-NEXT: s_mul_i32 s2, s0, s2 +; GFX1064-NEXT: s_add_i32 s10, s10, s3 +; GFX1064-NEXT: v_mov_b32_e32 v0, s2 ; GFX1064-NEXT: v_mov_b32_e32 v1, s10 -; GFX1064-NEXT: s_mov_b32 s10, -1 -; GFX1064-NEXT: s_mov_b32 s8, s6 -; GFX1064-NEXT: s_mov_b32 s9, s7 -; GFX1064-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc +; GFX1064-NEXT: s_mov_b32 s14, -1 +; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB4_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; 
GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1064-NEXT: .LBB4_2: ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -2150,32 +2237,34 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1032-NEXT: s_clause 0x1 ; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1032-NEXT: s_mov_b32 s9, exec_lo ; GFX1032-NEXT: s_mov_b32 s8, exec_lo +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s9, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s8 -; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s9 +; GFX1032-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s8, s1, s3 -; GFX1032-NEXT: s_mul_hi_u32 s9, s0, s3 -; GFX1032-NEXT: s_mul_i32 s3, s0, s3 -; GFX1032-NEXT: s_add_i32 s9, s9, s8 -; GFX1032-NEXT: v_mov_b32_e32 v0, s3 +; GFX1032-NEXT: s_mul_i32 s3, s1, s2 +; GFX1032-NEXT: s_mul_hi_u32 s9, s0, s2 +; GFX1032-NEXT: s_mul_i32 s2, s0, s2 +; GFX1032-NEXT: s_add_i32 s9, s9, s3 +; GFX1032-NEXT: v_mov_b32_e32 v0, s2 ; GFX1032-NEXT: v_mov_b32_e32 v1, s9 -; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: s_mov_b32 s8, s6 -; GFX1032-NEXT: s_mov_b32 s9, s7 -; GFX1032-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc +; GFX1032-NEXT: s_mov_b32 s14, -1 +; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: 
buffer_gl1_inv ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB4_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1032-NEXT: .LBB4_2: ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -2191,33 +2280,35 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1164-NEXT: s_mov_b64 s[10:11], exec ; GFX1164-NEXT: s_mov_b64 s[8:9], exec -; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s10, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s11, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_bcnt1_i32_b64 s8, s[8:9] -; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[10:11] +; GFX1164-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mul_i32 s9, s1, s8 -; GFX1164-NEXT: s_mul_hi_u32 s10, s0, s8 -; GFX1164-NEXT: s_mul_i32 s8, s0, s8 -; GFX1164-NEXT: s_add_i32 s10, s10, s9 -; GFX1164-NEXT: v_mov_b32_e32 v0, s8 +; GFX1164-NEXT: s_mul_i32 s3, s1, s2 +; GFX1164-NEXT: s_mul_hi_u32 s10, s0, s2 +; GFX1164-NEXT: s_mul_i32 s2, s0, s2 +; GFX1164-NEXT: s_add_i32 s10, s10, s3 +; GFX1164-NEXT: v_mov_b32_e32 v0, s2 ; GFX1164-NEXT: v_mov_b32_e32 v1, s10 -; GFX1164-NEXT: s_mov_b32 s10, -1 -; GFX1164-NEXT: s_mov_b32 
s8, s6 -; GFX1164-NEXT: s_mov_b32 s9, s7 -; GFX1164-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc +; GFX1164-NEXT: s_mov_b32 s14, -1 +; GFX1164-NEXT: s_mov_b32 s12, s6 +; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: buffer_atomic_add_u64 v[0:1], off, s[12:15], 0 glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv ; GFX1164-NEXT: buffer_gl0_inv +; GFX1164-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1164-NEXT: .LBB4_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -2238,32 +2329,34 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1132-NEXT: s_mov_b32 s9, exec_lo ; GFX1132-NEXT: s_mov_b32 s8, exec_lo -; GFX1132-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s9, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s8 -; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s9 +; GFX1132-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mul_i32 s8, s1, s3 -; GFX1132-NEXT: s_mul_hi_u32 s9, s0, s3 -; GFX1132-NEXT: s_mul_i32 s3, s0, s3 -; GFX1132-NEXT: s_add_i32 s9, s9, s8 +; GFX1132-NEXT: s_mul_i32 s3, s1, s2 +; GFX1132-NEXT: s_mul_hi_u32 s9, s0, s2 +; GFX1132-NEXT: s_mul_i32 s2, s0, s2 +; GFX1132-NEXT: s_add_i32 s9, s9, s3 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; 
GFX1132-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s9 -; GFX1132-NEXT: s_mov_b32 s10, -1 -; GFX1132-NEXT: s_mov_b32 s8, s6 -; GFX1132-NEXT: s_mov_b32 s9, s7 -; GFX1132-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc +; GFX1132-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s9 +; GFX1132-NEXT: s_mov_b32 s14, -1 +; GFX1132-NEXT: s_mov_b32 s12, s6 +; GFX1132-NEXT: s_mov_b32 s13, s7 +; GFX1132-NEXT: buffer_atomic_add_u64 v[0:1], off, s[12:15], 0 glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl1_inv ; GFX1132-NEXT: buffer_gl0_inv +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1132-NEXT: .LBB4_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) @@ -2284,31 +2377,33 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_clause 0x1 ; GFX1264-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX1264-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1264-NEXT: s_mov_b64 s[10:11], exec ; GFX1264-NEXT: s_mov_b64 s[8:9], exec -; GFX1264-NEXT: s_mov_b32 s11, 0 -; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1264-NEXT: s_mov_b64 s[2:3], exec +; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s10, 0 +; GFX1264-NEXT: s_mov_b32 s3, 0 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 +; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s11, v0 ; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1264-NEXT: s_cbranch_execz .LBB4_2 +; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1264-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1264-NEXT: s_cmov_b64 exec, vcc +; GFX1264-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1264-NEXT: ; %bb.1: -; GFX1264-NEXT: s_bcnt1_i32_b64 s10, s[8:9] +; GFX1264-NEXT: s_bcnt1_i32_b64 s2, s[10:11] +; GFX1264-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: 
s_mul_u64 s[8:9], s[0:1], s[10:11] -; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1264-NEXT: v_mov_b32_e32 v0, s8 -; GFX1264-NEXT: v_mov_b32_e32 v1, s9 -; GFX1264-NEXT: s_mov_b32 s10, -1 -; GFX1264-NEXT: s_mov_b32 s8, s6 -; GFX1264-NEXT: s_mov_b32 s9, s7 +; GFX1264-NEXT: s_mul_u64 s[2:3], s[0:1], s[2:3] +; GFX1264-NEXT: s_mov_b32 s14, -1 +; GFX1264-NEXT: v_mov_b32_e32 v0, s2 +; GFX1264-NEXT: v_mov_b32_e32 v1, s3 +; GFX1264-NEXT: s_mov_b32 s12, s6 +; GFX1264-NEXT: s_mov_b32 s13, s7 ; GFX1264-NEXT: global_wb scope:SCOPE_DEV -; GFX1264-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1264-NEXT: buffer_atomic_add_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV +; GFX1264-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1264-NEXT: .LBB4_2: -; GFX1264-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1264-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1264-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264-NEXT: s_wait_kmcnt 0x0 @@ -2328,13 +2423,15 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX1232-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX1232-NEXT: s_mov_b32 s9, exec_lo -; GFX1232-NEXT: s_mov_b32 s3, 0 -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s9, 0 ; GFX1232-NEXT: s_mov_b32 s8, exec_lo +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s9, 0 +; GFX1232-NEXT: s_mov_b32 s3, 0 ; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1232-NEXT: s_cbranch_execz .LBB4_2 +; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1232-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1232-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1232-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1232-NEXT: ; %bb.1: ; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s9 ; GFX1232-NEXT: s_mov_b32 s15, 0x31016000 @@ -2348,8 +2445,8 @@ define amdgpu_kernel void 
@add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: buffer_atomic_add_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV -; GFX1232-NEXT: .LBB4_2: ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1232-NEXT: .LBB4_2: ; GFX1232-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232-NEXT: s_wait_kmcnt 0x0 @@ -2395,10 +2492,11 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 +; GFX7LESS_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_scc0 .LBB5_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s10, -1 @@ -2410,8 +2508,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS_ITERATIVE-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: buffer_wbinvl1 -; GFX7LESS_ITERATIVE-NEXT: .LBB5_4: ; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: .LBB5_4: ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -2449,10 +2547,11 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, 
exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 +; GFX8_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX8_ITERATIVE-NEXT: s_cbranch_scc0 .LBB5_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 @@ -2464,8 +2563,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc ; GFX8_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX8_ITERATIVE-NEXT: buffer_wbinvl1_vol -; GFX8_ITERATIVE-NEXT: .LBB5_4: ; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8_ITERATIVE-NEXT: .LBB5_4: ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 @@ -2502,10 +2601,11 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 +; GFX9_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX9_ITERATIVE-NEXT: s_cbranch_scc0 .LBB5_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 @@ -2517,8 +2617,8 @@ define amdgpu_kernel void 
@add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc ; GFX9_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX9_ITERATIVE-NEXT: buffer_wbinvl1_vol -; GFX9_ITERATIVE-NEXT: .LBB5_4: ; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9_ITERATIVE-NEXT: .LBB5_4: ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s0, v4 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s1, v3 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 @@ -2555,9 +2655,10 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc0 .LBB5_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 @@ -2570,9 +2671,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1064_ITERATIVE-NEXT: .LBB5_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064_ITERATIVE-NEXT: .LBB5_4: ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s0, v3 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s1, v4 ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -2607,9 +2708,10 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: ; 
implicit-def: $vgpr3_vgpr4 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 -; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, vcc_lo, exec_lo +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc0 .LBB5_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 @@ -2622,9 +2724,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1032_ITERATIVE-NEXT: .LBB5_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032_ITERATIVE-NEXT: .LBB5_4: ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s0, v3 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s1, v4 ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -2665,10 +2767,10 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc0 .LBB5_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s5 @@ -2681,8 +2783,8 @@ 
define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1164_ITERATIVE-NEXT: .LBB5_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1164_ITERATIVE-NEXT: .LBB5_4: ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 @@ -2721,12 +2823,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s6, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s6, exec_lo, s6 -; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s6, vcc_lo, exec_lo +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc0 .LBB5_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 @@ -2738,8 +2841,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1132_ITERATIVE-NEXT: .LBB5_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX1132_ITERATIVE-NEXT: .LBB5_4: ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: 
v_readfirstlane_b32 s2, v2 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 @@ -2780,10 +2883,10 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1264_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1264_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 +; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1264_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1264_ITERATIVE-NEXT: s_cbranch_scc0 .LBB5_4 ; GFX1264_ITERATIVE-NEXT: ; %bb.3: ; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, s5 @@ -2796,8 +2899,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: buffer_atomic_add_u64 v[2:3], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264_ITERATIVE-NEXT: s_wait_loadcnt 0x0 ; GFX1264_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV -; GFX1264_ITERATIVE-NEXT: .LBB5_4: ; GFX1264_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1264_ITERATIVE-NEXT: .LBB5_4: ; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 @@ -2833,12 +2936,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1232_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; 
GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1232_ITERATIVE-NEXT: s_and_saveexec_b32 s6, vcc_lo -; GFX1232_ITERATIVE-NEXT: s_xor_b32 s6, exec_lo, s6 -; GFX1232_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 +; GFX1232_ITERATIVE-NEXT: s_xor_b32 s6, vcc_lo, exec_lo +; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1232_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1232_ITERATIVE-NEXT: s_cbranch_scc0 .LBB5_4 ; GFX1232_ITERATIVE-NEXT: ; %bb.3: ; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 @@ -2850,8 +2954,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: buffer_atomic_add_u64 v[2:3], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0 ; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV -; GFX1232_ITERATIVE-NEXT: .LBB5_4: ; GFX1232_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX1232_ITERATIVE-NEXT: .LBB5_4: ; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 @@ -2888,20 +2992,21 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_DPP: ; %bb.0: ; %entry ; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX8_DPP-NEXT: s_mov_b64 s[4:5], exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 ; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_DPP-NEXT: 
v_mov_b32_e32 v4, 0 ; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX8_DPP-NEXT: s_nop 0 @@ -2949,16 +3054,18 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc -; GFX8_DPP-NEXT: v_readlane_b32 s7, v4, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s6, v3, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s9, v4, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s8, v3, 63 ; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX8_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8_DPP-NEXT: s_cbranch_execz .LBB5_2 +; GFX8_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX8_DPP-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s7 ; GFX8_DPP-NEXT: s_mov_b32 s11, 0xf000 ; GFX8_DPP-NEXT: s_mov_b32 s10, -1 @@ -2969,8 +3076,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_DPP-NEXT: buffer_atomic_add_x2 v[7:8], off, s[8:11], 0 glc ; GFX8_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX8_DPP-NEXT: buffer_wbinvl1_vol -; GFX8_DPP-NEXT: .LBB5_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_DPP-NEXT: .LBB5_2: ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7 ; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -2988,20 +3095,21 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP: ; %bb.0: ; 
%entry ; GFX9_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX9_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 ; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX9_DPP-NEXT: s_nop 0 @@ -3049,16 +3157,18 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc -; GFX9_DPP-NEXT: v_readlane_b32 s3, v4, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s9, v4, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s8, v3, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_DPP-NEXT: s_cbranch_execz .LBB5_2 +; GFX9_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9_DPP-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: s_mov_b64 s[2:3], s[8:9] ; 
GFX9_DPP-NEXT: v_mov_b32_e32 v8, s3 ; GFX9_DPP-NEXT: s_mov_b32 s11, 0xf000 ; GFX9_DPP-NEXT: s_mov_b32 s10, -1 @@ -3069,8 +3179,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-NEXT: buffer_atomic_add_x2 v[7:8], off, s[8:11], 0 glc ; GFX9_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9_DPP-NEXT: buffer_wbinvl1_vol -; GFX9_DPP-NEXT: .LBB5_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: .LBB5_2: ; GFX9_DPP-NEXT: v_readfirstlane_b32 s0, v8 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s1, v7 ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -3088,6 +3198,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP: ; %bb.0: ; %entry ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1064_DPP-NEXT: s_mov_b64 s[8:9], exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -3147,32 +3258,35 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v4, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s2, v4, 15 ; GFX1064_DPP-NEXT: v_readlane_b32 s3, v3, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s8, v4, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s9, v3, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s10, v3, 47 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s10, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s11, v3, 31 ; GFX1064_DPP-NEXT: v_writelane_b32 v2, s2, 16 ; GFX1064_DPP-NEXT: v_writelane_b32 
v1, s3, 16 ; GFX1064_DPP-NEXT: v_readlane_b32 s2, v3, 63 -; GFX1064_DPP-NEXT: v_readlane_b32 s11, v4, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s12, v3, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s13, v4, 47 ; GFX1064_DPP-NEXT: v_readlane_b32 s3, v4, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 32 -; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s10, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s11, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX1064_DPP-NEXT: v_writelane_b32 v2, s11, 48 -; GFX1064_DPP-NEXT: v_writelane_b32 v1, s10, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[8:9] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s13, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s12, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 +; GFX1064_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX1064_DPP-NEXT: s_cbranch_execz .LBB5_2 +; GFX1064_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064_DPP-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX1064_DPP-NEXT: ; %bb.1: ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0 @@ -3184,9 +3298,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl1_inv ; GFX1064_DPP-NEXT: buffer_gl0_inv -; GFX1064_DPP-NEXT: .LBB5_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1064_DPP-NEXT: .LBB5_2: ; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v9 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, v2 @@ -3203,6 
+3317,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP: ; %bb.0: ; %entry ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1032_DPP-NEXT: s_mov_b32 s8, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -3248,26 +3363,29 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s8, v4, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s3, v3, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s9, v4, 15 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v2, s8, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v2, s9, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v1, s3, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 +; GFX1032_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1032_DPP-NEXT: ; 
implicit-def: $vgpr9_vgpr10 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo -; GFX1032_DPP-NEXT: s_cbranch_execz .LBB5_2 +; GFX1032_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX1032_DPP-NEXT: ; %bb.1: ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0 @@ -3279,9 +3397,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl1_inv ; GFX1032_DPP-NEXT: buffer_gl0_inv -; GFX1032_DPP-NEXT: .LBB5_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1032_DPP-NEXT: .LBB5_2: ; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v9 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v2 @@ -3296,6 +3414,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1164_DPP-LABEL: add_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: s_mov_b64 s[8:9], exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -3361,33 +3480,36 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 15 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s8, v4, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s9, v3, 31 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, 
exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s10, v4, 31 ; GFX1164_DPP-NEXT: v_writelane_b32 v1, s6, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s11, v3, 31 ; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s10, v4, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s11, v3, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s12, v4, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s13, v3, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s10, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s11, 32 ; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 32 -; GFX1164_DPP-NEXT: v_writelane_b32 v2, s9, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s12, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s13, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX1164_DPP-NEXT: v_writelane_b32 v1, s10, 48 -; GFX1164_DPP-NEXT: v_writelane_b32 v2, s11, 48 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[8:9] ; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[8:9], exec +; GFX1164_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 -; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164_DPP-NEXT: s_cbranch_execz .LBB5_2 +; GFX1164_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164_DPP-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX1164_DPP-NEXT: ; %bb.1: ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s5 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s4 @@ -3399,8 +3521,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: 
s_waitcnt vmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl1_inv ; GFX1164_DPP-NEXT: buffer_gl0_inv -; GFX1164_DPP-NEXT: .LBB5_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1164_DPP-NEXT: .LBB5_2: ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: v_readfirstlane_b32 s2, v8 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v1 @@ -3418,6 +3540,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1132_DPP-LABEL: add_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: s_mov_b32 s8, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -3460,32 +3583,34 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s7, v4, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s8, v3, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s9, v3, 15 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s6 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v1, s7, 16 -; GFX1132_DPP-NEXT: v_writelane_b32 v2, s8, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v2, s9, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s6 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1132_DPP-NEXT: s_mov_b32 s8, exec_lo +; GFX1132_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 -; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132_DPP-NEXT: s_cbranch_execz .LBB5_2 +; GFX1132_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX1132_DPP-NEXT: ; %bb.1: ; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4 ; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 @@ -3496,8 +3621,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl1_inv ; GFX1132_DPP-NEXT: buffer_gl0_inv -; GFX1132_DPP-NEXT: .LBB5_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1132_DPP-NEXT: .LBB5_2: ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: v_readfirstlane_b32 s2, v8 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v1 @@ -3515,6 +3640,7 @@ define amdgpu_kernel void @add_i64_varying(ptr 
addrspace(1) %out, ptr addrspace( ; ; GFX1264_DPP-LABEL: add_i64_varying: ; GFX1264_DPP: ; %bb.0: ; %entry +; GFX1264_DPP-NEXT: s_mov_b64 s[8:9], exec ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -3580,33 +3706,36 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1264_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1264_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1264_DPP-NEXT: v_readlane_b32 s6, v4, 15 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1264_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1264_DPP-NEXT: v_readlane_b32 s7, v3, 15 -; GFX1264_DPP-NEXT: v_readlane_b32 s8, v4, 31 -; GFX1264_DPP-NEXT: v_readlane_b32 s9, v3, 31 +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1264_DPP-NEXT: v_readlane_b32 s10, v4, 31 ; GFX1264_DPP-NEXT: v_writelane_b32 v1, s6, 16 -; GFX1264_DPP-NEXT: v_readlane_b32 s6, v4, 63 +; GFX1264_DPP-NEXT: v_readlane_b32 s11, v3, 31 ; GFX1264_DPP-NEXT: v_writelane_b32 v2, s7, 16 -; GFX1264_DPP-NEXT: v_readlane_b32 s10, v4, 47 -; GFX1264_DPP-NEXT: v_readlane_b32 s11, v3, 47 +; GFX1264_DPP-NEXT: v_readlane_b32 s6, v4, 63 +; GFX1264_DPP-NEXT: v_readlane_b32 s12, v4, 47 +; GFX1264_DPP-NEXT: v_readlane_b32 s13, v3, 47 +; GFX1264_DPP-NEXT: v_writelane_b32 v1, s10, 32 +; GFX1264_DPP-NEXT: v_writelane_b32 v2, s11, 32 ; GFX1264_DPP-NEXT: v_readlane_b32 s7, v3, 63 -; GFX1264_DPP-NEXT: v_writelane_b32 v1, s8, 32 -; GFX1264_DPP-NEXT: v_writelane_b32 v2, s9, 32 ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] -; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; 
GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX1264_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1264_DPP-NEXT: v_writelane_b32 v1, s12, 48 +; GFX1264_DPP-NEXT: v_writelane_b32 v2, s13, 48 +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1264_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX1264_DPP-NEXT: v_writelane_b32 v1, s10, 48 -; GFX1264_DPP-NEXT: v_writelane_b32 v2, s11, 48 -; GFX1264_DPP-NEXT: s_mov_b64 exec, s[8:9] ; GFX1264_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1264_DPP-NEXT: s_mov_b64 s[8:9], exec +; GFX1264_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1264_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 -; GFX1264_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1264_DPP-NEXT: s_cbranch_execz .LBB5_2 +; GFX1264_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1264_DPP-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX1264_DPP-NEXT: ; %bb.1: ; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, s5 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v8, s4 @@ -3618,8 +3747,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: buffer_atomic_add_u64 v[8:9], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0 ; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV -; GFX1264_DPP-NEXT: .LBB5_2: ; GFX1264_DPP-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1264_DPP-NEXT: .LBB5_2: ; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 ; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v8 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v10, v1 @@ -3637,6 +3766,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1232_DPP-LABEL: add_i64_varying: ; GFX1232_DPP: ; %bb.0: ; %entry +; GFX1232_DPP-NEXT: s_mov_b32 s8, exec_lo ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -3678,32 +3808,34 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 ; GFX1232_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1232_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1232_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1232_DPP-NEXT: v_readlane_b32 s5, v3, 31 ; GFX1232_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_readlane_b32 s7, v4, 15 -; GFX1232_DPP-NEXT: v_readlane_b32 s8, v3, 15 -; GFX1232_DPP-NEXT: v_readlane_b32 s5, v3, 31 ; GFX1232_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_readlane_b32 s9, v3, 15 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6 -; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1232_DPP-NEXT: 
v_writelane_b32 v1, s7, 16 -; GFX1232_DPP-NEXT: v_writelane_b32 v2, s8, 16 +; GFX1232_DPP-NEXT: v_writelane_b32 v2, s9, 16 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6 ; GFX1232_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1232_DPP-NEXT: s_mov_b32 s8, exec_lo +; GFX1232_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1232_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 -; GFX1232_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1232_DPP-NEXT: s_cbranch_execz .LBB5_2 +; GFX1232_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1232_DPP-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX1232_DPP-NEXT: ; %bb.1: ; GFX1232_DPP-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4 ; GFX1232_DPP-NEXT: s_mov_b32 s7, 0x31016000 @@ -3714,8 +3846,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: buffer_atomic_add_u64 v[8:9], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232_DPP-NEXT: s_wait_loadcnt 0x0 ; GFX1232_DPP-NEXT: global_inv scope:SCOPE_DEV -; GFX1232_DPP-NEXT: .LBB5_2: ; GFX1232_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1232_DPP-NEXT: .LBB5_2: ; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0 ; GFX1232_DPP-NEXT: v_readfirstlane_b32 s2, v8 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v10, v1 @@ -3742,13 +3874,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX7LESS-LABEL: sub_i32_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec +; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec ; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB6_2 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, 
s[6:7] @@ -3761,8 +3895,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX7LESS-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 -; GFX7LESS-NEXT: .LBB6_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: .LBB6_2: ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 @@ -3774,14 +3908,16 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: sub_i32_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB6_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s8, s2 @@ -3794,8 +3930,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: .LBB6_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: .LBB6_2: ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -3807,14 +3943,16 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: sub_i32_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[8:9], exec +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; 
GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[8:9] ; GFX9-NEXT: s_mul_i32 s2, s2, 5 @@ -3827,8 +3965,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: .LBB6_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: .LBB6_2: ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -3842,12 +3980,14 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1064-NEXT: s_mov_b64 s[8:9], exec -; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB6_2 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[8:9] ; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 @@ -3861,9 +4001,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB6_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: .LBB6_2: ; 
GFX1064-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -3877,11 +4017,13 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1032-NEXT: s_mov_b32 s1, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB6_2 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 @@ -3895,9 +4037,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB6_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: .LBB6_2: ; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -3916,8 +4058,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB6_2 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 @@ -3931,8 +4075,8 @@ define 
amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB6_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB6_2: ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -3953,8 +4097,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB6_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 @@ -3968,8 +4114,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl1_inv ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB6_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: .LBB6_2: ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -3991,8 +4137,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: ; implicit-def: $vgpr1 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 -; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1264-NEXT: s_cbranch_execz .LBB6_2 +; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1264-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1264-NEXT: s_cmov_b64 exec, vcc +; GFX1264-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX1264-NEXT: ; %bb.1: ; GFX1264-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; 
GFX1264-NEXT: s_mov_b32 s11, 0x31016000 @@ -4006,8 +4154,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV -; GFX1264-NEXT: .LBB6_2: ; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1264-NEXT: .LBB6_2: ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1264-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -4028,8 +4176,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 ; GFX1232-NEXT: ; implicit-def: $vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1232-NEXT: s_cbranch_execz .LBB6_2 +; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1232-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1232-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1232-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX1232-NEXT: ; %bb.1: ; GFX1232-NEXT: s_bcnt1_i32_b32 s5, s5 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 @@ -4043,8 +4193,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV -; GFX1232-NEXT: .LBB6_2: ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1232-NEXT: .LBB6_2: ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1232-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -4066,14 +4216,16 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS-LABEL: sub_i32_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GFX7LESS-NEXT: s_load_dword s2, s[2:3], 0xd ; GFX7LESS-NEXT: 
v_mbcnt_lo_u32_b32_e64 v0, s8, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s9, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_b64 s[10:11], vcc, -1 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB7_2 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s3, s[8:9] @@ -4086,8 +4238,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 -; GFX7LESS-NEXT: .LBB7_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: .LBB7_2: ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 @@ -4099,31 +4251,33 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: sub_i32_uniform: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_mov_b64 s[8:9], exec ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dword s8, s[2:3], 0x34 -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX8-NEXT: s_load_dword s2, s[2:3], 0x34 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB7_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX8-NEXT: s_bcnt1_i32_b64 s3, s[8:9] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mul_i32 s0, s8, s0 +; GFX8-NEXT: s_mul_i32 s3, s2, s3 ; 
GFX8-NEXT: s_mov_b32 s15, 0xf000 ; GFX8-NEXT: s_mov_b32 s14, -1 ; GFX8-NEXT: s_mov_b32 s12, s6 ; GFX8-NEXT: s_mov_b32 s13, s7 -; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8-NEXT: .LBB7_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX8-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v1 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 @@ -4133,31 +4287,33 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_mov_b64 s[8:9], exec ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-NEXT: s_load_dword s10, s[2:3], 0x34 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[8:9] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s0, s8, s0 +; GFX9-NEXT: s_mul_i32 s2, s10, s2 ; GFX9-NEXT: s_mov_b32 s15, 0xf000 ; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: 
buffer_wbinvl1_vol +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: .LBB7_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, s10, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -4171,12 +4327,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1064-NEXT: s_load_dword s10, s[2:3], 0x34 ; GFX1064-NEXT: s_mov_b64 s[8:9], exec -; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB7_2 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[8:9] ; GFX1064-NEXT: s_mov_b32 s15, 0x31016000 @@ -4190,9 +4348,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB7_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: .LBB7_2: ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mul_lo_u32 v0, s10, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 @@ -4208,11 +4366,13 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x34 ; GFX1032-NEXT: s_mov_b32 s8, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: s_mov_b32 s1, exec_lo ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1032-NEXT: ; 
implicit-def: $vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB7_2 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s8 ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 @@ -4226,9 +4386,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB7_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: .LBB7_2: ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 @@ -4249,8 +4409,10 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB7_2 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s3, s[8:9] ; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 @@ -4264,8 +4426,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB7_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: .LBB7_2: ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX1164-NEXT: v_readfirstlane_b32 s0, v1 @@ -4288,8 +4450,10 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; 
GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB7_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s8 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 @@ -4303,8 +4467,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl1_inv ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB7_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132-NEXT: .LBB7_2: ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX1132-NEXT: v_readfirstlane_b32 s0, v1 @@ -4328,8 +4492,10 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: ; implicit-def: $vgpr1 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 -; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1264-NEXT: s_cbranch_execz .LBB7_2 +; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1264-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1264-NEXT: s_cmov_b64 exec, vcc +; GFX1264-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX1264-NEXT: ; %bb.1: ; GFX1264-NEXT: s_bcnt1_i32_b64 s3, s[8:9] ; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 @@ -4343,8 +4509,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV -; GFX1264-NEXT: .LBB7_2: ; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1264-NEXT: .LBB7_2: ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX1264-NEXT: 
v_readfirstlane_b32 s0, v1 @@ -4367,8 +4533,10 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1232-NEXT: ; implicit-def: $vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1232-NEXT: s_cbranch_execz .LBB7_2 +; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1232-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1232-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1232-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX1232-NEXT: ; %bb.1: ; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s8 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 @@ -4382,8 +4550,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV -; GFX1232-NEXT: .LBB7_2: ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1232-NEXT: .LBB7_2: ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX1232-NEXT: v_readfirstlane_b32 s0, v1 @@ -4424,10 +4592,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 +; GFX7LESS_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_scc0 .LBB8_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s10, -1 @@ -4438,8 +4607,8 @@ define 
amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS_ITERATIVE-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: buffer_wbinvl1 -; GFX7LESS_ITERATIVE-NEXT: .LBB8_4: ; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: .LBB8_4: ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -4470,10 +4639,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 +; GFX8_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX8_ITERATIVE-NEXT: s_cbranch_scc0 .LBB8_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s10, -1 @@ -4484,8 +4654,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX8_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX8_ITERATIVE-NEXT: buffer_wbinvl1_vol -; GFX8_ITERATIVE-NEXT: .LBB8_4: ; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_ITERATIVE-NEXT: .LBB8_4: ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 @@ -4515,10 +4685,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, 
exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 +; GFX9_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX9_ITERATIVE-NEXT: s_cbranch_scc0 .LBB8_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: s_mov_b32 s15, 0xf000 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s14, -1 @@ -4529,8 +4700,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-NEXT: buffer_atomic_sub v0, off, s[12:15], 0 glc ; GFX9_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX9_ITERATIVE-NEXT: buffer_wbinvl1_vol -; GFX9_ITERATIVE-NEXT: .LBB8_4: ; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_ITERATIVE-NEXT: .LBB8_4: ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_ITERATIVE-NEXT: s_mov_b32 s7, 0xf000 @@ -4560,9 +4731,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc0 .LBB8_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, s8 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 @@ -4574,9 +4746,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; 
GFX1064_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1064_ITERATIVE-NEXT: .LBB8_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: .LBB8_4: ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s7, 0x31016000 @@ -4605,9 +4777,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc0 .LBB8_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 @@ -4619,9 +4792,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1032_ITERATIVE-NEXT: .LBB8_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: .LBB8_4: ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s7, 0x31016000 @@ -4655,10 +4828,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX1164_ITERATIVE-NEXT: ; 
implicit-def: $vgpr1 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc0 .LBB8_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, s6 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 @@ -4670,8 +4843,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1164_ITERATIVE-NEXT: .LBB8_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: .LBB8_4: ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 @@ -4704,12 +4877,13 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s5, exec_lo, s5 -; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s5, vcc_lo, exec_lo +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc0 
.LBB8_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: ; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v1, s4 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 @@ -4721,8 +4895,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1132_ITERATIVE-NEXT: .LBB8_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1132_ITERATIVE-NEXT: .LBB8_4: ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 @@ -4759,10 +4933,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX1264_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1264_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 +; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1264_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1264_ITERATIVE-NEXT: s_cbranch_scc0 .LBB8_4 ; GFX1264_ITERATIVE-NEXT: ; %bb.3: ; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v1, s6 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 @@ -4774,8 +4948,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264_ITERATIVE-NEXT: s_wait_loadcnt 0x0 ; GFX1264_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV -; GFX1264_ITERATIVE-NEXT: .LBB8_4: ; GFX1264_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1264_ITERATIVE-NEXT: .LBB8_4: ; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1264_ITERATIVE-NEXT: 
v_readfirstlane_b32 s2, v1 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 @@ -4808,12 +4982,13 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1232_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1232_ITERATIVE-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1232_ITERATIVE-NEXT: s_xor_b32 s5, exec_lo, s5 -; GFX1232_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 +; GFX1232_ITERATIVE-NEXT: s_xor_b32 s5, vcc_lo, exec_lo +; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1232_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1232_ITERATIVE-NEXT: s_cbranch_scc0 .LBB8_4 ; GFX1232_ITERATIVE-NEXT: ; %bb.3: ; GFX1232_ITERATIVE-NEXT: v_mov_b32_e32 v1, s4 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 @@ -4825,8 +5000,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0 ; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV -; GFX1232_ITERATIVE-NEXT: .LBB8_4: ; GFX1232_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1232_ITERATIVE-NEXT: .LBB8_4: ; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 @@ -4859,16 +5034,17 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_DPP-LABEL: sub_i32_varying: ; GFX8_DPP: ; %bb.0: ; %entry ; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8_DPP-NEXT: 
s_mov_b64 s[4:5], exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -4880,15 +5056,17 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_readlane_b32 s6, v2, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s8, v2, 63 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX8_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8_DPP-NEXT: s_cbranch_execz .LBB8_2 +; GFX8_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX8_DPP-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: s_mov_b32 s6, s8 ; GFX8_DPP-NEXT: s_mov_b32 s11, 0xf000 ; GFX8_DPP-NEXT: s_mov_b32 s10, -1 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -4898,8 +5076,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_DPP-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX8_DPP-NEXT: s_waitcnt vmcnt(0) ; 
GFX8_DPP-NEXT: buffer_wbinvl1_vol -; GFX8_DPP-NEXT: .LBB8_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_DPP-NEXT: .LBB8_2: ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -4912,16 +5090,17 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-LABEL: sub_i32_varying: ; GFX9_DPP: ; %bb.0: ; %entry ; GFX9_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -4933,15 +5112,17 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_readlane_b32 s2, v2, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s8, v2, 63 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX9_DPP-NEXT: 
s_and_saveexec_b64 s[0:1], vcc -; GFX9_DPP-NEXT: s_cbranch_execz .LBB8_2 +; GFX9_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9_DPP-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: s_mov_b32 s2, s8 ; GFX9_DPP-NEXT: s_mov_b32 s11, 0xf000 ; GFX9_DPP-NEXT: s_mov_b32 s10, -1 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -4951,8 +5132,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX9_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9_DPP-NEXT: buffer_wbinvl1_vol -; GFX9_DPP-NEXT: .LBB8_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: .LBB8_2: ; GFX9_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -4964,6 +5145,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1064_DPP-LABEL: sub_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: s_mov_b64 s[8:9], exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -4980,30 +5162,31 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s10, v1, 15 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s8, 16 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s10, 16 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s9, v1, 63 ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 32 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 47 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064_DPP-NEXT: s_mov_b32 s0, s9 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s8, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s3, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b32 s0, s2 ; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 +; GFX1064_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX1064_DPP-NEXT: s_cbranch_execz .LBB8_2 +; GFX1064_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064_DPP-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX1064_DPP-NEXT: ; %bb.1: ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s0 ; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 @@ -5014,9 +5197,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl1_inv ; GFX1064_DPP-NEXT: buffer_gl0_inv -; GFX1064_DPP-NEXT: .LBB8_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1064_DPP-NEXT: .LBB8_2: ; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -5028,6 +5211,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1032_DPP-LABEL: sub_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: s_mov_b32 
s8, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -5042,22 +5226,25 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_readlane_b32 s2, v1, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: s_mov_b32 s0, s2 ; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 +; GFX1032_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo -; GFX1032_DPP-NEXT: s_cbranch_execz .LBB8_2 +; GFX1032_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX1032_DPP-NEXT: ; %bb.1: ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s0 ; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 @@ -5068,9 +5255,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl1_inv ; GFX1032_DPP-NEXT: buffer_gl0_inv -; GFX1032_DPP-NEXT: .LBB8_2: ; GFX1032_DPP-NEXT: 
s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1032_DPP-NEXT: .LBB8_2: ; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -5083,6 +5270,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-LABEL: sub_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_mov_b64 s[8:9], exec ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec @@ -5109,30 +5297,29 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s9, v1, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 63 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1164_DPP-NEXT: s_mov_b32 s4, s9 -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s8, 48 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX1164_DPP-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX1164_DPP-NEXT: s_cbranch_execz .LBB8_2 +; GFX1164_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164_DPP-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s7 ; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: s_mov_b32 s4, s2 @@ -5141,8 +5328,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl1_inv ; GFX1164_DPP-NEXT: buffer_gl0_inv -; GFX1164_DPP-NEXT: .LBB8_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1164_DPP-NEXT: .LBB8_2: ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -5158,6 +5345,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-LABEL: sub_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s8, exec_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo @@ -5171,30 +5359,32 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s4, s6 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1132_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo -; GFX1132_DPP-NEXT: s_cbranch_execz .LBB8_2 +; GFX1132_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX1132_DPP-NEXT: ; %bb.1: ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s4 ; GFX1132_DPP-NEXT: 
s_mov_b32 s7, 0x31016000 @@ -5205,8 +5395,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl1_inv ; GFX1132_DPP-NEXT: buffer_gl0_inv -; GFX1132_DPP-NEXT: .LBB8_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1132_DPP-NEXT: .LBB8_2: ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -5222,6 +5412,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-LABEL: sub_i32_varying: ; GFX1264_DPP: ; %bb.0: ; %entry ; GFX1264_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1264_DPP-NEXT: s_mov_b64 s[8:9], exec ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1264_DPP-NEXT: s_not_b64 exec, exec @@ -5248,30 +5439,29 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1264_DPP-NEXT: v_readlane_b32 s7, v1, 31 ; GFX1264_DPP-NEXT: v_writelane_b32 v3, s6, 16 ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1264_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1264_DPP-NEXT: v_readlane_b32 s8, v1, 47 -; GFX1264_DPP-NEXT: v_readlane_b32 s9, v1, 63 +; GFX1264_DPP-NEXT: v_readlane_b32 s6, v1, 47 ; GFX1264_DPP-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1264_DPP-NEXT: v_readlane_b32 s7, v1, 63 ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] -; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; 
GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1264_DPP-NEXT: s_mov_b32 s4, s9 -; GFX1264_DPP-NEXT: v_writelane_b32 v3, s8, 48 -; GFX1264_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX1264_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1264_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1264_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1264_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1264_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1264_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX1264_DPP-NEXT: s_cbranch_execz .LBB8_2 +; GFX1264_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1264_DPP-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX1264_DPP-NEXT: ; %bb.1: -; GFX1264_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v0, s7 ; GFX1264_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 ; GFX1264_DPP-NEXT: s_mov_b32 s4, s2 @@ -5280,8 +5470,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0 ; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV -; GFX1264_DPP-NEXT: .LBB8_2: ; GFX1264_DPP-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1264_DPP-NEXT: .LBB8_2: ; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 ; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -5297,6 +5487,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-LABEL: sub_i32_varying: ; GFX1232_DPP: ; %bb.0: ; %entry ; GFX1232_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1232_DPP-NEXT: s_mov_b32 s8, exec_lo ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo @@ -5310,30 +5501,32 @@ define amdgpu_kernel void @sub_i32_varying(ptr 
addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1232_DPP-NEXT: v_readlane_b32 s6, v1, 31 ; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1232_DPP-NEXT: s_mov_b32 s4, s6 ; GFX1232_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1232_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; 
GFX1232_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1232_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo -; GFX1232_DPP-NEXT: s_cbranch_execz .LBB8_2 +; GFX1232_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1232_DPP-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX1232_DPP-NEXT: ; %bb.1: ; GFX1232_DPP-NEXT: v_mov_b32_e32 v0, s4 ; GFX1232_DPP-NEXT: s_mov_b32 s7, 0x31016000 @@ -5344,8 +5537,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232_DPP-NEXT: s_wait_loadcnt 0x0 ; GFX1232_DPP-NEXT: global_inv scope:SCOPE_DEV -; GFX1232_DPP-NEXT: .LBB8_2: ; GFX1232_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1232_DPP-NEXT: .LBB8_2: ; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0 ; GFX1232_DPP-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -5368,13 +5561,15 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX7LESS-LABEL: sub_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec +; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec ; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7LESS-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB9_2 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] @@ -5388,8 +5583,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX7LESS-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 -; GFX7LESS-NEXT: .LBB9_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] +; 
GFX7LESS-NEXT: .LBB9_2: ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 @@ -5406,14 +5601,16 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: sub_i64_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB9_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s8, s2 @@ -5427,8 +5624,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: .LBB9_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: .LBB9_2: ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 ; GFX8-NEXT: v_readfirstlane_b32 s5, v0 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 @@ -5444,14 +5641,16 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: sub_i64_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[8:9], exec +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB9_2 +; GFX9-NEXT: s_cmov_b64 
exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[8:9] ; GFX9-NEXT: s_mul_i32 s2, s2, 5 @@ -5465,8 +5664,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: .LBB9_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: .LBB9_2: ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: v_readfirstlane_b32 s1, v0 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 @@ -5484,12 +5683,14 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1064-NEXT: s_mov_b64 s[8:9], exec +; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB9_2 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[8:9] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 @@ -5504,9 +5705,9 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB9_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: .LBB9_2: ; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX1064-NEXT: v_readfirstlane_b32 s1, v1 @@ -5523,11 +5724,13 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; 
GFX1032-NEXT: s_mov_b32 s1, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB9_2 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 @@ -5542,9 +5745,9 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB9_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: .LBB9_2: ; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX1032-NEXT: v_readfirstlane_b32 s1, v1 @@ -5566,8 +5769,10 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB9_2 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 @@ -5582,8 +5787,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB9_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB9_2: ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 
; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v2 @@ -5607,8 +5812,10 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB9_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 @@ -5622,8 +5829,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl1_inv ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB9_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: .LBB9_2: ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2 @@ -5643,14 +5850,16 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264: ; %bb.0: ; %entry ; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1264-NEXT: s_mov_b64 s[6:7], exec -; GFX1264-NEXT: s_mov_b32 s9, 0 -; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1264-NEXT: s_mov_b64 s[4:5], exec +; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX1264-NEXT: s_mov_b32 s9, 0 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1264-NEXT: s_cbranch_execz .LBB9_2 +; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1264-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1264-NEXT: s_cmov_b64 exec, vcc +; GFX1264-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX1264-NEXT: ; %bb.1: ; GFX1264-NEXT: 
s_bcnt1_i32_b64 s8, s[6:7] ; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 @@ -5665,8 +5874,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV -; GFX1264-NEXT: .LBB9_2: ; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1264-NEXT: .LBB9_2: ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264-NEXT: v_mul_u32_u24_e32 v0, 5, v2 @@ -5686,13 +5895,15 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232: ; %bb.0: ; %entry ; GFX1232-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1232-NEXT: s_mov_b32 s4, exec_lo -; GFX1232-NEXT: s_mov_b32 s5, 0 -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s4, 0 ; GFX1232-NEXT: s_mov_b32 s6, exec_lo +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s4, 0 +; GFX1232-NEXT: s_mov_b32 s5, 0 ; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1232-NEXT: s_cbranch_execz .LBB9_2 +; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1232-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1232-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1232-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX1232-NEXT: ; %bb.1: ; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 @@ -5706,8 +5917,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV -; GFX1232-NEXT: .LBB9_2: ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX1232-NEXT: .LBB9_2: ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232-NEXT: v_mul_u32_u24_e32 v0, 5, v2 @@ -5731,33 +5942,35 @@ entry: define amdgpu_kernel void 
@sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(1) %inout, i64 %subitive) { ; GFX7LESS-LABEL: sub_i64_uniform: ; GFX7LESS: ; %bb.0: ; %entry +; GFX7LESS-NEXT: s_mov_b64 s[10:11], exec ; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec ; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s9, v0 +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s10, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s11, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7LESS-NEXT: s_and_b64 s[2:3], vcc, -1 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB10_2 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB10_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[8:9] -; GFX7LESS-NEXT: s_mul_i32 s7, s1, s6 -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[10:11] +; GFX7LESS-NEXT: s_mul_i32 s3, s1, s2 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 ; GFX7LESS-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX7LESS-NEXT: s_mul_i32 s6, s0, s6 -; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 +; GFX7LESS-NEXT: s_mul_i32 s2, s0, s2 +; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s3, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 ; GFX7LESS-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7LESS-NEXT: .LBB10_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 @@ -5776,31 +5989,33 @@ define 
amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: sub_i64_uniform: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_mov_b64 s[10:11], exec ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX8-NEXT: s_mov_b64 s[8:9], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s10, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s11, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: s_mov_b64 s[8:9], exec +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB10_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB10_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s12, s6 -; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[8:9] +; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[10:11] ; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s0, v0, 0 -; GFX8-NEXT: s_mul_i32 s6, s1, s6 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s0, v0, 0 +; GFX8-NEXT: s_mul_i32 s2, s1, s6 ; GFX8-NEXT: s_mov_b32 s15, 0xf000 ; GFX8-NEXT: s_mov_b32 s14, -1 ; GFX8-NEXT: s_mov_b32 s13, s7 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s2, v1 ; GFX8-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: .LBB10_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v4, s1, v2 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s0, v2, 0 @@ -5817,33 +6032,35 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: sub_i64_uniform: ; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_mov_b64 s[10:11], exec ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_mov_b64 s[8:9], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s10, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s11, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_mov_b64 s[8:9], exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB10_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB10_2 ; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[10:11] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[8:9] -; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mul_i32 s7, s1, s6 -; GFX9-NEXT: s_mul_hi_u32 s8, s0, s6 -; GFX9-NEXT: s_add_i32 s8, s8, s7 -; GFX9-NEXT: s_mul_i32 s6, s0, s6 +; GFX9-NEXT: s_mul_i32 s3, s1, s2 +; GFX9-NEXT: s_mul_hi_u32 s6, s0, s2 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_mul_i32 s2, s0, s2 ; GFX9-NEXT: s_mov_b32 s15, 0xf000 ; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: .LBB10_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[2:3], s0, v2, 0 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 @@ -5863,33 +6080,35 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1064-NEXT: s_mov_b64 s[10:11], exec ; GFX1064-NEXT: s_mov_b64 s[8:9], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; 
GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s10, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s11, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB10_2 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB10_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s8, s[8:9] -; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[10:11] +; GFX1064-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s9, s1, s8 -; GFX1064-NEXT: s_mul_hi_u32 s10, s0, s8 -; GFX1064-NEXT: s_mul_i32 s8, s0, s8 -; GFX1064-NEXT: s_add_i32 s10, s10, s9 -; GFX1064-NEXT: v_mov_b32_e32 v0, s8 +; GFX1064-NEXT: s_mul_i32 s3, s1, s2 +; GFX1064-NEXT: s_mul_hi_u32 s10, s0, s2 +; GFX1064-NEXT: s_mul_i32 s2, s0, s2 +; GFX1064-NEXT: s_add_i32 s10, s10, s3 +; GFX1064-NEXT: v_mov_b32_e32 v0, s2 ; GFX1064-NEXT: v_mov_b32_e32 v1, s10 -; GFX1064-NEXT: s_mov_b32 s10, -1 -; GFX1064-NEXT: s_mov_b32 s8, s6 -; GFX1064-NEXT: s_mov_b32 s9, s7 -; GFX1064-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc +; GFX1064-NEXT: s_mov_b32 s14, -1 +; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB10_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1064-NEXT: .LBB10_2: ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[2:3], s0, v2, 0 ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 @@ -5908,32 +6127,34 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1032-NEXT: s_clause 0x1 ; 
GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1032-NEXT: s_mov_b32 s9, exec_lo ; GFX1032-NEXT: s_mov_b32 s8, exec_lo +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s9, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB10_2 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB10_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s8 -; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s9 +; GFX1032-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s8, s1, s3 -; GFX1032-NEXT: s_mul_hi_u32 s9, s0, s3 -; GFX1032-NEXT: s_mul_i32 s3, s0, s3 -; GFX1032-NEXT: s_add_i32 s9, s9, s8 -; GFX1032-NEXT: v_mov_b32_e32 v0, s3 +; GFX1032-NEXT: s_mul_i32 s3, s1, s2 +; GFX1032-NEXT: s_mul_hi_u32 s9, s0, s2 +; GFX1032-NEXT: s_mul_i32 s2, s0, s2 +; GFX1032-NEXT: s_add_i32 s9, s9, s3 +; GFX1032-NEXT: v_mov_b32_e32 v0, s2 ; GFX1032-NEXT: v_mov_b32_e32 v1, s9 -; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: s_mov_b32 s8, s6 -; GFX1032-NEXT: s_mov_b32 s9, s7 -; GFX1032-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc +; GFX1032-NEXT: s_mov_b32 s14, -1 +; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB10_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1032-NEXT: .LBB10_2: ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s0, s0, v2, 0 ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 @@ -5952,33 
+6173,35 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1164-NEXT: s_mov_b64 s[10:11], exec ; GFX1164-NEXT: s_mov_b64 s[8:9], exec -; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s10, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s11, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB10_2 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB10_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_bcnt1_i32_b64 s8, s[8:9] -; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[10:11] +; GFX1164-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mul_i32 s9, s1, s8 -; GFX1164-NEXT: s_mul_hi_u32 s10, s0, s8 -; GFX1164-NEXT: s_mul_i32 s8, s0, s8 -; GFX1164-NEXT: s_add_i32 s10, s10, s9 -; GFX1164-NEXT: v_mov_b32_e32 v0, s8 +; GFX1164-NEXT: s_mul_i32 s3, s1, s2 +; GFX1164-NEXT: s_mul_hi_u32 s10, s0, s2 +; GFX1164-NEXT: s_mul_i32 s2, s0, s2 +; GFX1164-NEXT: s_add_i32 s10, s10, s3 +; GFX1164-NEXT: v_mov_b32_e32 v0, s2 ; GFX1164-NEXT: v_mov_b32_e32 v1, s10 -; GFX1164-NEXT: s_mov_b32 s10, -1 -; GFX1164-NEXT: s_mov_b32 s8, s6 -; GFX1164-NEXT: s_mov_b32 s9, s7 -; GFX1164-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc +; GFX1164-NEXT: s_mov_b32 s14, -1 +; GFX1164-NEXT: s_mov_b32 s12, s6 +; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[12:15], 0 glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv ; GFX1164-NEXT: 
buffer_gl0_inv +; GFX1164-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1164-NEXT: .LBB10_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s0, v2, 0 ; GFX1164-NEXT: v_readfirstlane_b32 s0, v0 @@ -6001,32 +6224,34 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1132-NEXT: s_mov_b32 s9, exec_lo ; GFX1132-NEXT: s_mov_b32 s8, exec_lo -; GFX1132-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s9, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB10_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB10_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s8 -; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s9 +; GFX1132-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mul_i32 s8, s1, s3 -; GFX1132-NEXT: s_mul_hi_u32 s9, s0, s3 -; GFX1132-NEXT: s_mul_i32 s3, s0, s3 -; GFX1132-NEXT: s_add_i32 s9, s9, s8 +; GFX1132-NEXT: s_mul_i32 s3, s1, s2 +; GFX1132-NEXT: s_mul_hi_u32 s9, s0, s2 +; GFX1132-NEXT: s_mul_i32 s2, s0, s2 +; GFX1132-NEXT: s_add_i32 s9, s9, s3 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s9 -; GFX1132-NEXT: s_mov_b32 s10, -1 -; GFX1132-NEXT: s_mov_b32 s8, s6 -; GFX1132-NEXT: s_mov_b32 s9, s7 -; GFX1132-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc +; GFX1132-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s9 +; GFX1132-NEXT: s_mov_b32 s14, -1 +; GFX1132-NEXT: 
s_mov_b32 s12, s6 +; GFX1132-NEXT: s_mov_b32 s13, s7 +; GFX1132-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[12:15], 0 glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl1_inv ; GFX1132-NEXT: buffer_gl0_inv +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1132-NEXT: .LBB10_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s0, v2, 0 ; GFX1132-NEXT: v_readfirstlane_b32 s0, v0 @@ -6049,31 +6274,33 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_clause 0x1 ; GFX1264-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX1264-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1264-NEXT: s_mov_b64 s[10:11], exec ; GFX1264-NEXT: s_mov_b64 s[8:9], exec -; GFX1264-NEXT: s_mov_b32 s11, 0 -; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1264-NEXT: s_mov_b64 s[2:3], exec +; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s10, 0 +; GFX1264-NEXT: s_mov_b32 s3, 0 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 +; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s11, v0 ; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1264-NEXT: s_cbranch_execz .LBB10_2 +; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1264-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1264-NEXT: s_cmov_b64 exec, vcc +; GFX1264-NEXT: s_cbranch_scc0 .LBB10_2 ; GFX1264-NEXT: ; %bb.1: -; GFX1264-NEXT: s_bcnt1_i32_b64 s10, s[8:9] +; GFX1264-NEXT: s_bcnt1_i32_b64 s2, s[10:11] +; GFX1264-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: s_mul_u64 s[8:9], s[0:1], s[10:11] -; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1264-NEXT: v_mov_b32_e32 v0, s8 -; GFX1264-NEXT: v_mov_b32_e32 v1, s9 -; GFX1264-NEXT: s_mov_b32 s10, -1 -; GFX1264-NEXT: s_mov_b32 s8, s6 -; GFX1264-NEXT: s_mov_b32 s9, s7 +; GFX1264-NEXT: s_mul_u64 s[2:3], s[0:1], s[2:3] +; GFX1264-NEXT: s_mov_b32 
s14, -1 +; GFX1264-NEXT: v_mov_b32_e32 v0, s2 +; GFX1264-NEXT: v_mov_b32_e32 v1, s3 +; GFX1264-NEXT: s_mov_b32 s12, s6 +; GFX1264-NEXT: s_mov_b32 s13, s7 ; GFX1264-NEXT: global_wb scope:SCOPE_DEV -; GFX1264-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1264-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV +; GFX1264-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1264-NEXT: .LBB10_2: -; GFX1264-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: v_mad_co_u64_u32 v[3:4], null, s0, v2, 0 ; GFX1264-NEXT: v_readfirstlane_b32 s0, v0 @@ -6097,13 +6324,15 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX1232-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 ; GFX1232-NEXT: s_mov_b32 s9, exec_lo -; GFX1232-NEXT: s_mov_b32 s3, 0 -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s9, 0 ; GFX1232-NEXT: s_mov_b32 s8, exec_lo +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s9, 0 +; GFX1232-NEXT: s_mov_b32 s3, 0 ; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1232-NEXT: s_cbranch_execz .LBB10_2 +; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1232-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1232-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1232-NEXT: s_cbranch_scc0 .LBB10_2 ; GFX1232-NEXT: ; %bb.1: ; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s9 ; GFX1232-NEXT: s_mov_b32 s15, 0x31016000 @@ -6117,8 +6346,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV -; GFX1232-NEXT: .LBB10_2: ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1232-NEXT: 
.LBB10_2: ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_mad_co_u64_u32 v[3:4], null, s0, v2, 0 ; GFX1232-NEXT: v_readfirstlane_b32 s0, v0 @@ -6168,10 +6397,11 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX7LESS_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_scc0 .LBB11_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s10, -1 @@ -6183,8 +6413,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS_ITERATIVE-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: buffer_wbinvl1 -; GFX7LESS_ITERATIVE-NEXT: .LBB11_4: ; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: .LBB11_4: ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -6222,10 +6452,11 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX8_ITERATIVE-NEXT: 
s_and_saveexec_b64 s[6:7], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX8_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX8_ITERATIVE-NEXT: s_cbranch_scc0 .LBB11_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 @@ -6237,8 +6468,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc ; GFX8_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX8_ITERATIVE-NEXT: buffer_wbinvl1_vol -; GFX8_ITERATIVE-NEXT: .LBB11_4: ; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8_ITERATIVE-NEXT: .LBB11_4: ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 @@ -6275,10 +6506,11 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX9_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX9_ITERATIVE-NEXT: s_cbranch_scc0 .LBB11_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 @@ -6290,8 +6522,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc ; GFX9_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX9_ITERATIVE-NEXT: buffer_wbinvl1_vol -; GFX9_ITERATIVE-NEXT: .LBB11_4: ; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, 
s[2:3] +; GFX9_ITERATIVE-NEXT: .LBB11_4: ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s0, v4 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s1, v3 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 @@ -6328,9 +6560,10 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc0 .LBB11_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 @@ -6343,9 +6576,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1064_ITERATIVE-NEXT: .LBB11_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064_ITERATIVE-NEXT: .LBB11_4: ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s0, v3 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s1, v4 ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -6380,9 +6613,10 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 -; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, 
vcc_lo, exec_lo +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc0 .LBB11_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 @@ -6395,9 +6629,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1032_ITERATIVE-NEXT: .LBB11_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032_ITERATIVE-NEXT: .LBB11_4: ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s0, v3 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s1, v4 ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -6438,10 +6672,10 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc0 .LBB11_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s5 @@ -6454,8 +6688,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1164_ITERATIVE-NEXT: .LBB11_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 
exec, exec, s[6:7] +; GFX1164_ITERATIVE-NEXT: .LBB11_4: ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 @@ -6494,12 +6728,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s6, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s6, exec_lo, s6 -; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s6, vcc_lo, exec_lo +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc0 .LBB11_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 @@ -6511,8 +6746,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1132_ITERATIVE-NEXT: .LBB11_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX1132_ITERATIVE-NEXT: .LBB11_4: ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 @@ -6553,10 +6788,10 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1264_ITERATIVE-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1264_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1264_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1264_ITERATIVE-NEXT: s_cbranch_scc0 .LBB11_4 ; GFX1264_ITERATIVE-NEXT: ; %bb.3: ; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, s5 @@ -6569,8 +6804,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: buffer_atomic_sub_u64 v[2:3], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264_ITERATIVE-NEXT: s_wait_loadcnt 0x0 ; GFX1264_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV -; GFX1264_ITERATIVE-NEXT: .LBB11_4: ; GFX1264_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1264_ITERATIVE-NEXT: .LBB11_4: ; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 @@ -6606,12 +6841,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1232_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1232_ITERATIVE-NEXT: s_and_saveexec_b32 s6, vcc_lo -; GFX1232_ITERATIVE-NEXT: s_xor_b32 s6, exec_lo, s6 -; GFX1232_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX1232_ITERATIVE-NEXT: s_xor_b32 s6, vcc_lo, exec_lo +; 
GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1232_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1232_ITERATIVE-NEXT: s_cbranch_scc0 .LBB11_4 ; GFX1232_ITERATIVE-NEXT: ; %bb.3: ; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 @@ -6623,8 +6859,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: buffer_atomic_sub_u64 v[2:3], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0 ; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV -; GFX1232_ITERATIVE-NEXT: .LBB11_4: ; GFX1232_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX1232_ITERATIVE-NEXT: .LBB11_4: ; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 @@ -6661,20 +6897,21 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_DPP: ; %bb.0: ; %entry ; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX8_DPP-NEXT: s_mov_b64 s[4:5], exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 ; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX8_DPP-NEXT: s_nop 0 @@ -6722,16 
+6959,18 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc -; GFX8_DPP-NEXT: v_readlane_b32 s7, v4, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s6, v3, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s9, v4, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s8, v3, 63 ; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX8_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8_DPP-NEXT: s_cbranch_execz .LBB11_2 +; GFX8_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX8_DPP-NEXT: s_cbranch_scc0 .LBB11_2 ; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s7 ; GFX8_DPP-NEXT: s_mov_b32 s11, 0xf000 ; GFX8_DPP-NEXT: s_mov_b32 s10, -1 @@ -6742,8 +6981,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_DPP-NEXT: buffer_atomic_sub_x2 v[7:8], off, s[8:11], 0 glc ; GFX8_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX8_DPP-NEXT: buffer_wbinvl1_vol -; GFX8_DPP-NEXT: .LBB11_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_DPP-NEXT: .LBB11_2: ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7 ; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -6761,20 +7000,21 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP: ; %bb.0: ; %entry ; GFX9_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX9_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_DPP-NEXT: 
v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 ; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX9_DPP-NEXT: s_nop 0 @@ -6822,16 +7062,18 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc -; GFX9_DPP-NEXT: v_readlane_b32 s3, v4, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s9, v4, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s8, v3, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_DPP-NEXT: s_cbranch_execz .LBB11_2 +; GFX9_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9_DPP-NEXT: s_cbranch_scc0 .LBB11_2 ; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: s_mov_b64 s[2:3], s[8:9] ; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s3 ; GFX9_DPP-NEXT: s_mov_b32 s11, 0xf000 ; GFX9_DPP-NEXT: s_mov_b32 s10, -1 @@ -6842,8 +7084,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-NEXT: 
buffer_atomic_sub_x2 v[7:8], off, s[8:11], 0 glc ; GFX9_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9_DPP-NEXT: buffer_wbinvl1_vol -; GFX9_DPP-NEXT: .LBB11_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: .LBB11_2: ; GFX9_DPP-NEXT: v_readfirstlane_b32 s0, v8 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s1, v7 ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -6861,6 +7103,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP: ; %bb.0: ; %entry ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1064_DPP-NEXT: s_mov_b64 s[8:9], exec ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -6920,32 +7163,35 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v4, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s2, v4, 15 ; GFX1064_DPP-NEXT: v_readlane_b32 s3, v3, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s8, v4, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s9, v3, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s10, v3, 47 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s10, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s11, v3, 31 ; GFX1064_DPP-NEXT: v_writelane_b32 v2, s2, 16 ; GFX1064_DPP-NEXT: v_writelane_b32 v1, s3, 16 ; GFX1064_DPP-NEXT: v_readlane_b32 s2, v3, 63 -; GFX1064_DPP-NEXT: v_readlane_b32 s11, v4, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s12, v3, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s13, v4, 47 ; GFX1064_DPP-NEXT: 
v_readlane_b32 s3, v4, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 32 -; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s10, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s11, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX1064_DPP-NEXT: v_writelane_b32 v2, s11, 48 -; GFX1064_DPP-NEXT: v_writelane_b32 v1, s10, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[8:9] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s13, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s12, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 +; GFX1064_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX1064_DPP-NEXT: s_cbranch_execz .LBB11_2 +; GFX1064_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064_DPP-NEXT: s_cbranch_scc0 .LBB11_2 ; GFX1064_DPP-NEXT: ; %bb.1: ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0 @@ -6957,9 +7203,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl1_inv ; GFX1064_DPP-NEXT: buffer_gl0_inv -; GFX1064_DPP-NEXT: .LBB11_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1064_DPP-NEXT: .LBB11_2: ; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v9 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, v2 @@ -6976,6 +7222,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP: ; %bb.0: ; %entry ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 +; 
GFX1032_DPP-NEXT: s_mov_b32 s8, exec_lo ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -7021,26 +7268,29 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s8, v4, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s3, v3, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s9, v4, 15 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v2, s8, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v2, s9, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v1, s3, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 +; GFX1032_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo -; GFX1032_DPP-NEXT: s_cbranch_execz .LBB11_2 +; GFX1032_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_scc0 .LBB11_2 
; GFX1032_DPP-NEXT: ; %bb.1: ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0 @@ -7052,9 +7302,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl1_inv ; GFX1032_DPP-NEXT: buffer_gl0_inv -; GFX1032_DPP-NEXT: .LBB11_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1032_DPP-NEXT: .LBB11_2: ; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v9 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v2 @@ -7069,6 +7319,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1164_DPP-LABEL: sub_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: s_mov_b64 s[8:9], exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -7134,33 +7385,36 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 15 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s8, v4, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s9, v3, 31 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s10, v4, 31 ; GFX1164_DPP-NEXT: v_writelane_b32 v1, s6, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 63 +; 
GFX1164_DPP-NEXT: v_readlane_b32 s11, v3, 31 ; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s10, v4, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s11, v3, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s12, v4, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s13, v3, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s10, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s11, 32 ; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 32 -; GFX1164_DPP-NEXT: v_writelane_b32 v2, s9, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s12, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s13, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX1164_DPP-NEXT: v_writelane_b32 v1, s10, 48 -; GFX1164_DPP-NEXT: v_writelane_b32 v2, s11, 48 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[8:9] ; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[8:9], exec +; GFX1164_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 -; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164_DPP-NEXT: s_cbranch_execz .LBB11_2 +; GFX1164_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164_DPP-NEXT: s_cbranch_scc0 .LBB11_2 ; GFX1164_DPP-NEXT: ; %bb.1: ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s5 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s4 @@ -7172,8 +7426,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl1_inv ; GFX1164_DPP-NEXT: buffer_gl0_inv -; GFX1164_DPP-NEXT: .LBB11_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1164_DPP-NEXT: .LBB11_2: ; 
GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: v_readfirstlane_b32 s2, v8 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v1 @@ -7191,6 +7445,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1132_DPP-LABEL: sub_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: s_mov_b32 s8, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -7233,32 +7488,34 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v3, 31 ; 
GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s7, v4, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s8, v3, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s9, v3, 15 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s6 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v1, s7, 16 -; GFX1132_DPP-NEXT: v_writelane_b32 v2, s8, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v2, s9, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s6 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1132_DPP-NEXT: s_mov_b32 s8, exec_lo +; GFX1132_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 -; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132_DPP-NEXT: s_cbranch_execz .LBB11_2 +; GFX1132_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_scc0 .LBB11_2 ; GFX1132_DPP-NEXT: ; %bb.1: ; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4 ; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 @@ -7269,8 +7526,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl1_inv ; GFX1132_DPP-NEXT: buffer_gl0_inv -; GFX1132_DPP-NEXT: .LBB11_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1132_DPP-NEXT: .LBB11_2: ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: v_readfirstlane_b32 s2, v8 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v1 @@ -7288,6 +7545,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1264_DPP-LABEL: sub_i64_varying: ; GFX1264_DPP: ; %bb.0: ; %entry +; GFX1264_DPP-NEXT: s_mov_b64 s[8:9], exec ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; 
GFX1264_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -7353,33 +7611,36 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1264_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1264_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1264_DPP-NEXT: v_readlane_b32 s6, v4, 15 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1264_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1264_DPP-NEXT: v_readlane_b32 s7, v3, 15 -; GFX1264_DPP-NEXT: v_readlane_b32 s8, v4, 31 -; GFX1264_DPP-NEXT: v_readlane_b32 s9, v3, 31 +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1264_DPP-NEXT: v_readlane_b32 s10, v4, 31 ; GFX1264_DPP-NEXT: v_writelane_b32 v1, s6, 16 -; GFX1264_DPP-NEXT: v_readlane_b32 s6, v4, 63 +; GFX1264_DPP-NEXT: v_readlane_b32 s11, v3, 31 ; GFX1264_DPP-NEXT: v_writelane_b32 v2, s7, 16 -; GFX1264_DPP-NEXT: v_readlane_b32 s10, v4, 47 -; GFX1264_DPP-NEXT: v_readlane_b32 s11, v3, 47 +; GFX1264_DPP-NEXT: v_readlane_b32 s6, v4, 63 +; GFX1264_DPP-NEXT: v_readlane_b32 s12, v4, 47 +; GFX1264_DPP-NEXT: v_readlane_b32 s13, v3, 47 +; GFX1264_DPP-NEXT: v_writelane_b32 v1, s10, 32 +; GFX1264_DPP-NEXT: v_writelane_b32 v2, s11, 32 ; GFX1264_DPP-NEXT: v_readlane_b32 s7, v3, 63 -; GFX1264_DPP-NEXT: v_writelane_b32 v1, s8, 32 -; GFX1264_DPP-NEXT: v_writelane_b32 v2, s9, 32 ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] -; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX1264_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; 
GFX1264_DPP-NEXT: v_writelane_b32 v1, s12, 48 +; GFX1264_DPP-NEXT: v_writelane_b32 v2, s13, 48 +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1264_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX1264_DPP-NEXT: v_writelane_b32 v1, s10, 48 -; GFX1264_DPP-NEXT: v_writelane_b32 v2, s11, 48 -; GFX1264_DPP-NEXT: s_mov_b64 exec, s[8:9] ; GFX1264_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1264_DPP-NEXT: s_mov_b64 s[8:9], exec +; GFX1264_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1264_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 -; GFX1264_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1264_DPP-NEXT: s_cbranch_execz .LBB11_2 +; GFX1264_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1264_DPP-NEXT: s_cbranch_scc0 .LBB11_2 ; GFX1264_DPP-NEXT: ; %bb.1: ; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, s5 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v8, s4 @@ -7391,8 +7652,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: buffer_atomic_sub_u64 v[8:9], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0 ; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV -; GFX1264_DPP-NEXT: .LBB11_2: ; GFX1264_DPP-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1264_DPP-NEXT: .LBB11_2: ; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 ; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v8 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v10, v1 @@ -7410,6 +7671,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1232_DPP-LABEL: sub_i64_varying: ; GFX1232_DPP: ; %bb.0: ; %entry +; GFX1232_DPP-NEXT: s_mov_b32 s8, exec_lo ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -7451,32 +7713,34 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 ; GFX1232_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1232_DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1232_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1232_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1232_DPP-NEXT: v_readlane_b32 s5, v3, 31 ; GFX1232_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_readlane_b32 s7, v4, 15 -; GFX1232_DPP-NEXT: v_readlane_b32 s8, v3, 15 -; GFX1232_DPP-NEXT: v_readlane_b32 s5, v3, 31 ; GFX1232_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_readlane_b32 s9, v3, 15 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6 -; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1232_DPP-NEXT: v_writelane_b32 v1, s7, 16 -; GFX1232_DPP-NEXT: v_writelane_b32 v2, s8, 16 +; GFX1232_DPP-NEXT: v_writelane_b32 v2, s9, 16 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6 ; GFX1232_DPP-NEXT: s_mov_b32 s6, 
-1 -; GFX1232_DPP-NEXT: s_mov_b32 s8, exec_lo +; GFX1232_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1232_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 -; GFX1232_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1232_DPP-NEXT: s_cbranch_execz .LBB11_2 +; GFX1232_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1232_DPP-NEXT: s_cbranch_scc0 .LBB11_2 ; GFX1232_DPP-NEXT: ; %bb.1: ; GFX1232_DPP-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4 ; GFX1232_DPP-NEXT: s_mov_b32 s7, 0x31016000 @@ -7487,8 +7751,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: buffer_atomic_sub_u64 v[8:9], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232_DPP-NEXT: s_wait_loadcnt 0x0 ; GFX1232_DPP-NEXT: global_inv scope:SCOPE_DEV -; GFX1232_DPP-NEXT: .LBB11_2: ; GFX1232_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1232_DPP-NEXT: .LBB11_2: ; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0 ; GFX1232_DPP-NEXT: v_readfirstlane_b32 s2, v8 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v10, v1 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index c7296185422cef..39319940426a49 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -25,12 +25,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX7LESS-LABEL: add_i32_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB0_2 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: 
s_bcnt1_i32_b64 s4, s[4:5] ; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 @@ -39,8 +41,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: .LBB0_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: .LBB0_2: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 @@ -56,9 +58,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_cbranch_execz .LBB0_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_mul_i32 s4, s4, 5 @@ -67,8 +71,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB0_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: .LBB0_2: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 @@ -84,9 +88,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB0_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; 
GFX9-NEXT: s_mul_i32 s4, s4, 5 @@ -94,8 +100,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB0_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: .LBB0_2: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 @@ -108,12 +114,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX1064-LABEL: add_i32_constant: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_mov_b64 s[4:5], exec -; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB0_2 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 @@ -122,9 +130,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB0_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: .LBB0_2: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 @@ -138,11 +146,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX1032-LABEL: add_i32_constant: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_mov_b32 s1, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX1032-NEXT: ; 
implicit-def: $vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB0_2 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 @@ -151,9 +161,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB0_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: .LBB0_2: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 @@ -172,8 +182,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB0_2 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 @@ -183,8 +195,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB0_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: .LBB0_2: ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 @@ -204,8 +216,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: 
v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB0_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -214,8 +228,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB0_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: .LBB0_2: ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 @@ -237,13 +251,15 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX7LESS-LABEL: add_i32_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: s_load_dword s6, s[2:3], 0xb ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB1_2 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -253,8 +269,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: 
.LBB1_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: .LBB1_2: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 @@ -267,14 +283,16 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; ; GFX8-LABEL: add_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_cbranch_execz .LBB1_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -284,8 +302,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB1_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: .LBB1_2: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -298,14 +316,16 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; 
GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -314,8 +334,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB1_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: .LBB1_2: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -330,12 +350,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX1064-NEXT: s_mov_b64 s[4:5], exec -; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB1_2 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 @@ -345,9 +367,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB1_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: .LBB1_2: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 @@ -362,11 +384,13 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dword 
s0, s[2:3], 0x2c ; GFX1032-NEXT: s_mov_b32 s4, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: s_mov_b32 s1, exec_lo ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB1_2 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 @@ -376,9 +400,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB1_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: .LBB1_2: ; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 @@ -398,8 +422,10 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB1_2 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 @@ -410,8 +436,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX1164-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB1_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: .LBB1_2: ; GFX1164-NEXT: 
s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -432,8 +458,10 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB1_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) @@ -443,8 +471,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX1132-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB1_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132-NEXT: .LBB1_2: ; GFX1132-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 @@ -484,18 +512,19 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX7LESS_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; 
GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_ITERATIVE-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS_ITERATIVE-NEXT: .LBB2_4: ; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: .LBB2_4: ; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -525,18 +554,19 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX8_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX8_ITERATIVE-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_ITERATIVE-NEXT: .LBB2_4: ; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_ITERATIVE-NEXT: .LBB2_4: ; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 @@ -566,17 +596,18 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; 
GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX9_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX9_ITERATIVE-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX9_ITERATIVE-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_ITERATIVE-NEXT: .LBB2_4: ; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_ITERATIVE-NEXT: .LBB2_4: ; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 @@ -606,18 +637,19 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064_ITERATIVE-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1064_ITERATIVE-NEXT: .LBB2_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: .LBB2_4: ; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 @@ -647,18 +679,19 @@ define amdgpu_kernel 
void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 ; GFX1032_ITERATIVE-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1032_ITERATIVE-NEXT: .LBB2_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: .LBB2_4: ; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 @@ -693,18 +726,18 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 
; GFX1164_ITERATIVE-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1164_ITERATIVE-NEXT: .LBB2_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: .LBB2_4: ; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 @@ -737,19 +770,20 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 ; GFX1132_ITERATIVE-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1132_ITERATIVE-NEXT: .LBB2_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: .LBB2_4: ; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 @@ -777,6 +811,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; ; GFX8_DPP-LABEL: add_i32_varying: ; GFX8_DPP: ; %bb.0: ; 
%entry +; GFX8_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 @@ -784,7 +819,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX8_DPP-NEXT: s_nop 0 @@ -797,21 +832,22 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s6, v1, 63 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8_DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX8_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX8_DPP-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s6 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 ; GFX8_DPP-NEXT: ds_add_rtn_u32 v0, v3, v0 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_DPP-NEXT: .LBB2_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: .LBB2_2: ; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v2 @@ -824,6 +860,7 @@ define amdgpu_kernel void 
@add_i32_varying(ptr addrspace(1) %out) { ; ; GFX9_DPP-LABEL: add_i32_varying: ; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 @@ -831,7 +868,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9_DPP-NEXT: s_nop 0 @@ -844,20 +881,21 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s6, v1, 63 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX9_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9_DPP-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s6 ; GFX9_DPP-NEXT: ds_add_rtn_u32 v0, v3, v0 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_DPP-NEXT: .LBB2_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: .LBB2_2: ; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 
v2 @@ -870,11 +908,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: add_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -883,38 +922,42 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s6 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 
31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 16 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 63 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064_DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX1064_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064_DPP-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s7 +; GFX1064_DPP-NEXT: s_mov_b32 s4, s7 ; GFX1064_DPP-NEXT: ds_add_rtn_u32 v0, v4, v0 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv -; GFX1064_DPP-NEXT: .LBB2_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_DPP-NEXT: .LBB2_2: ; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -926,11 +969,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: add_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: s_not_b32 
exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s1, -1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -939,28 +983,31 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s1, -1 +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s1, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032_DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX1032_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s5 ; 
GFX1032_DPP-NEXT: ds_add_rtn_u32 v0, v4, v0 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv -; GFX1032_DPP-NEXT: .LBB2_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032_DPP-NEXT: .LBB2_2: ; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -973,12 +1020,13 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-LABEL: add_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -991,40 +1039,43 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s6 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc 
bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 16 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 63 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164_DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX1164_DPP-NEXT: 
s_cmov_b64 exec, vcc +; GFX1164_DPP-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s7 +; GFX1164_DPP-NEXT: s_mov_b32 s4, s7 ; GFX1164_DPP-NEXT: ds_add_rtn_u32 v0, v4, v0 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv -; GFX1164_DPP-NEXT: .LBB2_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_DPP-NEXT: .LBB2_2: ; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -1040,12 +1091,13 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: add_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s1, -1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1057,28 +1109,31 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 
row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s1, -1 +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s1, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132_DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX1132_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s5 ; GFX1132_DPP-NEXT: ds_add_rtn_u32 v0, v4, v0 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1132_DPP-NEXT: .LBB2_2: -; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -1116,9 +1171,10 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; 
GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB3_4 +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2 @@ -1145,9 +1201,10 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB3_4 +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX8_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX8_ITERATIVE-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2 @@ -1174,9 +1231,10 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB3_4 +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX9_ITERATIVE-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2 @@ -1202,9 +1260,10 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; 
GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB3_4 +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2 @@ -1230,9 +1289,10 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB3_4 +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v1, s0 @@ -1260,12 +1320,13 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164_ITERATIVE-NEXT: 
s_cbranch_execz .LBB3_4 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2 @@ -1293,11 +1354,12 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB3_4 +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX1132_ITERATIVE-NEXT: ds_add_u32 v0, v1 @@ -1337,12 +1399,12 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX8_DPP-NEXT: v_readlane_b32 s2, v1, 63 ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: s_mov_b32 s0, s2 ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8_DPP-NEXT: s_cbranch_execz .LBB3_2 +; GFX8_DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX8_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX8_DPP-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s2 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 ; 
GFX8_DPP-NEXT: ds_add_u32 v2, v0 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1372,12 +1434,12 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9_DPP-NEXT: v_readlane_b32 s2, v1, 63 ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: s_mov_b32 s0, s2 ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9_DPP-NEXT: s_cbranch_execz .LBB3_2 +; GFX9_DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9_DPP-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s2 ; GFX9_DPP-NEXT: ds_add_u32 v2, v0 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB3_2: @@ -1407,8 +1469,9 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_DPP-NEXT: s_add_i32 s0, s2, s3 ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064_DPP-NEXT: s_cbranch_execz .LBB3_2 +; GFX1064_DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064_DPP-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX1064_DPP-NEXT: ; %bb.1: ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s0 ; GFX1064_DPP-NEXT: ds_add_u32 v0, v3 @@ -1430,14 +1493,17 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032_DPP-NEXT: s_cbranch_execz .LBB3_2 +; GFX1032_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX1032_DPP-NEXT: ; %bb.1: ; GFX1032_DPP-NEXT: ds_add_u32 v0, v3 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1464,22 +1530,25 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1164_DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164_DPP-NEXT: 
v_cmpx_eq_u32_e32 0, v4 -; GFX1164_DPP-NEXT: s_cbranch_execz .LBB3_2 +; GFX1164_DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164_DPP-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX1164_DPP-NEXT: ; %bb.1: ; GFX1164_DPP-NEXT: ds_add_u32 v0, v3 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1503,17 +1572,20 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1 -; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132_DPP-NEXT: s_cbranch_execz .LBB3_2 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1 +; GFX1132_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX1132_DPP-NEXT: ; %bb.1: ; GFX1132_DPP-NEXT: ds_add_u32 v0, v3 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1530,12 +1602,14 @@ define amdgpu_kernel void 
@add_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-LABEL: add_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7LESS-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB4_2 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 @@ -1544,8 +1618,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: .LBB4_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: .LBB4_2: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 @@ -1566,9 +1640,11 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_cbranch_execz .LBB4_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_mul_i32 s4, s4, 5 @@ -1577,8 +1653,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB4_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; 
GFX8-NEXT: .LBB4_2: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_readfirstlane_b32 s3, v0 @@ -1598,9 +1674,11 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB4_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 @@ -1608,8 +1686,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB4_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: .LBB4_2: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_readfirstlane_b32 s3, v0 @@ -1626,12 +1704,14 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1064-LABEL: add_i64_constant: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 @@ -1640,9 +1720,9 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: ds_add_rtn_u64 
v[0:1], v1, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB4_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: .LBB4_2: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 @@ -1657,11 +1737,13 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1032-LABEL: add_i64_constant: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_mov_b32 s1, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 @@ -1670,9 +1752,9 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB4_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: .LBB4_2: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 @@ -1692,8 +1774,10 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; 
GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 @@ -1703,8 +1787,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB4_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: .LBB4_2: ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 @@ -1725,8 +1809,10 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX1132-NEXT: v_mov_b32_e32 v1, 0 @@ -1736,8 +1822,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB4_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: .LBB4_2: ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 @@ -1760,13 +1846,15 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX7LESS-LABEL: add_i64_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec +; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec ; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; 
GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7LESS-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB5_2 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 @@ -1780,8 +1868,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: .LBB5_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: .LBB5_2: ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -1801,14 +1889,16 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; ; GFX8-LABEL: add_i64_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB5_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, s8 @@ -1820,8 +1910,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB5_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: .LBB5_2: ; 
GFX8-NEXT: v_readfirstlane_b32 s4, v1 ; GFX8-NEXT: v_readfirstlane_b32 s5, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s5 @@ -1839,14 +1929,16 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; ; GFX9-LABEL: add_i64_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[8:9], exec +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[8:9] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1859,8 +1951,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB5_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: .LBB5_2: ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: v_readfirstlane_b32 s1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 @@ -1880,12 +1972,14 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1064-NEXT: s_mov_b64 s[8:9], exec +; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB5_2 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX1064-NEXT: ; %bb.1: ; 
GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[8:9] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 @@ -1899,9 +1993,9 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB5_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: .LBB5_2: ; GFX1064-NEXT: v_readfirstlane_b32 s1, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -1916,11 +2010,13 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1032-NEXT: s_mov_b32 s1, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB5_2 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 @@ -1934,9 +2030,9 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB5_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: .LBB5_2: ; GFX1032-NEXT: v_readfirstlane_b32 s1, v1 ; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -1956,8 +2052,10 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB5_2 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 @@ -1971,8 +2069,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX1164-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB5_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB5_2: ; GFX1164-NEXT: v_readfirstlane_b32 s5, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -1996,8 +2094,10 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB5_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 @@ -2011,8 +2111,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX1132-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB5_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: .LBB5_2: ; GFX1132-NEXT: v_readfirstlane_b32 s5, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) @@ -2059,10 +2159,11 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) 
%out) { ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 +; GFX7LESS_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_scc0 .LBB6_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -2070,8 +2171,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS_ITERATIVE-NEXT: .LBB6_4: ; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: .LBB6_4: ; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -2108,10 +2209,11 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 +; GFX8_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX8_ITERATIVE-NEXT: s_cbranch_scc0 .LBB6_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: 
v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -2119,8 +2221,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_ITERATIVE-NEXT: .LBB6_4: ; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_ITERATIVE-NEXT: .LBB6_4: ; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 @@ -2157,18 +2259,19 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 +; GFX9_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX9_ITERATIVE-NEXT: s_cbranch_scc0 .LBB6_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX9_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_ITERATIVE-NEXT: .LBB6_4: ; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_ITERATIVE-NEXT: .LBB6_4: ; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 @@ -2205,9 +2308,10 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc0 .LBB6_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -2215,9 +2319,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1064_ITERATIVE-NEXT: .LBB6_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: .LBB6_4: ; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 @@ -2253,9 +2357,10 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 -; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, vcc_lo, exec_lo +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc0 .LBB6_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -2263,9 +2368,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], 
v0, v[3:4] ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1032_ITERATIVE-NEXT: .LBB6_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: .LBB6_4: ; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 @@ -2307,10 +2412,10 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc0 .LBB6_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 @@ -2318,8 +2423,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: ds_add_rtn_u64 v[2:3], v4, v[2:3] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1164_ITERATIVE-NEXT: .LBB6_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: .LBB6_4: ; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 @@ -2358,20 +2463,21 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: 
v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 -; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, vcc_lo, exec_lo +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc0 .LBB6_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 ; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 ; GFX1132_ITERATIVE-NEXT: ds_add_rtn_u64 v[2:3], v4, v[2:3] ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1132_ITERATIVE-NEXT: .LBB6_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: .LBB6_4: ; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 @@ -2402,20 +2508,21 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-LABEL: add_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry ; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX8_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 ; GFX8_DPP-NEXT: 
s_not_b64 exec, exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX8_DPP-NEXT: s_nop 0 @@ -2463,23 +2570,24 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc -; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s7, v4, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s6, v3, 63 ; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8_DPP-NEXT: s_cbranch_execz .LBB6_2 +; GFX8_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX8_DPP-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s6 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 ; GFX8_DPP-NEXT: ds_add_rtn_u64 v[9:10], v8, v[9:10] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_DPP-NEXT: .LBB6_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: .LBB6_2: ; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v10 @@ -2497,20 +2605,21 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-LABEL: add_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry ; 
GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX9_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 ; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX9_DPP-NEXT: s_nop 0 @@ -2558,22 +2667,23 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc -; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s7, v4, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s6, v3, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_DPP-NEXT: s_cbranch_execz .LBB6_2 +; GFX9_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9_DPP-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s7 +; 
GFX9_DPP-NEXT: v_mov_b32_e32 v9, s6 ; GFX9_DPP-NEXT: ds_add_rtn_u64 v[9:10], v8, v[9:10] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_DPP-NEXT: .LBB6_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: .LBB6_2: ; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v10 @@ -2591,18 +2701,19 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-LABEL: add_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -2639,52 +2750,55 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 ; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s4 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v3, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v4, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s6 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s7 ; 
GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 ; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s8, v3, 47 -; GFX1064_DPP-NEXT: v_writelane_b32 v2, s4, 16 -; GFX1064_DPP-NEXT: v_writelane_b32 v1, s5, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 63 -; GFX1064_DPP-NEXT: v_readlane_b32 s9, v4, 47 -; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32 -; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX1064_DPP-NEXT: v_writelane_b32 v2, s9, 48 -; GFX1064_DPP-NEXT: v_writelane_b32 v1, s8, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s9, v3, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 16 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 
s6, v3, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s10, v3, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s11, v4, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v4, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s11, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s10, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1064_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064_DPP-NEXT: s_cbranch_execz .LBB6_2 +; GFX1064_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064_DPP-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s5 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s4 ; GFX1064_DPP-NEXT: ds_add_rtn_u64 v[11:12], v10, v[11:12] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv -; GFX1064_DPP-NEXT: .LBB6_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_DPP-NEXT: .LBB6_2: ; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1 @@ -2700,6 +2814,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-LABEL: add_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1032_DPP-NEXT: s_mov_b32 s4, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -2711,7 +2826,7 @@ define amdgpu_kernel void 
@add_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -2746,32 +2861,35 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s5, -1 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s6, v4, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v3, 15 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v2, s6, 16 -; GFX1032_DPP-NEXT: v_writelane_b32 v1, s5, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_readlane_b32 s7, v4, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s5 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s5, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v2, s7, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s5 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1032_DPP-NEXT: ; implicit-def: 
$vgpr11_vgpr12 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032_DPP-NEXT: s_cbranch_execz .LBB6_2 +; GFX1032_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX1032_DPP-NEXT: ; %bb.1: ; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0 ; GFX1032_DPP-NEXT: ds_add_rtn_u64 v[11:12], v10, v[11:12] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv -; GFX1032_DPP-NEXT: .LBB6_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_DPP-NEXT: .LBB6_2: ; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1 @@ -2786,15 +2904,16 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; ; GFX1164_DPP-LABEL: add_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0 ; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v8 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 @@ -2802,7 +2921,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; 
GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -2836,56 +2955,59 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v4, 31 ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v7, vcc ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s5 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s7 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v3, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s6 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v3 
row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 31 -; GFX1164_DPP-NEXT: v_writelane_b32 v1, s4, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v2, s5, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 63 -; GFX1164_DPP-NEXT: v_readlane_b32 s8, v4, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s9, v3, 47 -; GFX1164_DPP-NEXT: v_writelane_b32 v1, s6, 32 -; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 32 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 48 -; GFX1164_DPP-NEXT: v_writelane_b32 v2, s9, 48 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v4, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v3, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s10, v4, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s11, v3, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s9, 32 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 63 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s10, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s11, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_DPP-NEXT: 
s_cmp_lg_u64 vcc, 0 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr10_vgpr11 -; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164_DPP-NEXT: s_cbranch_execz .LBB6_2 +; GFX1164_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164_DPP-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s5 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s4 ; GFX1164_DPP-NEXT: ds_add_rtn_u64 v[10:11], v9, v[10:11] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164_DPP-NEXT: .LBB6_2: -; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v10 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v1 @@ -2903,6 +3025,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; ; GFX1132_DPP-LABEL: add_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -2918,7 +3041,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -2948,33 +3071,35 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, 
vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1132_DPP-NEXT: v_readlane_b32 s0, v4, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s5, -1 +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v4, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s0, v4, 31 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v3, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v1, s5, 16 -; GFX1132_DPP-NEXT: v_writelane_b32 v2, s6, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s5 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s5, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v2, s7, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s5 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr10_vgpr11 -; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; 
GFX1132_DPP-NEXT: s_cbranch_execz .LBB6_2 +; GFX1132_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX1132_DPP-NEXT: ; %bb.1: ; GFX1132_DPP-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 ; GFX1132_DPP-NEXT: ds_add_rtn_u64 v[10:11], v9, v[10:11] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv -; GFX1132_DPP-NEXT: .LBB6_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_DPP-NEXT: .LBB6_2: ; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v10 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v1 @@ -3020,9 +3145,10 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4 +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX7LESS_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 @@ -3053,9 +3179,10 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4 +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX8_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX8_ITERATIVE-NEXT: s_cbranch_scc0 .LBB7_4 ; 
GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 @@ -3086,9 +3213,10 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4 +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX9_ITERATIVE-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 @@ -3118,9 +3246,10 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4 +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 @@ -3150,9 +3279,10 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 -; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4 +; GFX1032_ITERATIVE-NEXT: 
s_xor_b32 s2, vcc_lo, exec_lo +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 @@ -3185,12 +3315,13 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 @@ -3222,11 +3353,12 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 -; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4 +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s2, vcc_lo, exec_lo +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_cmov_b32 
exec_lo, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: ; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 @@ -3308,13 +3440,13 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX8_DPP-NEXT: v_readlane_b32 s3, v2, 63 ; GFX8_DPP-NEXT: v_readlane_b32 s2, v1, 63 ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8_DPP-NEXT: s_cbranch_execz .LBB7_2 +; GFX8_DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX8_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX8_DPP-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s2 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 ; GFX8_DPP-NEXT: ds_add_u64 v8, v[9:10] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -3385,13 +3517,13 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX9_DPP-NEXT: v_readlane_b32 s3, v2, 63 ; GFX9_DPP-NEXT: v_readlane_b32 s2, v1, 63 ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9_DPP-NEXT: s_cbranch_execz .LBB7_2 +; GFX9_DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9_DPP-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s2 ; GFX9_DPP-NEXT: ds_add_u64 v8, v[9:10] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB7_2: @@ -3452,8 +3584,9 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1064_DPP-NEXT: s_add_u32 s0, s3, s4 ; GFX1064_DPP-NEXT: s_addc_u32 s1, s2, s5 ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 
vcc, 0, v0 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064_DPP-NEXT: s_cbranch_execz .LBB7_2 +; GFX1064_DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064_DPP-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX1064_DPP-NEXT: ; %bb.1: ; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0 @@ -3504,15 +3637,18 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v2 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032_DPP-NEXT: s_cbranch_execz .LBB7_2 +; GFX1032_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX1032_DPP-NEXT: ; %bb.1: ; GFX1032_DPP-NEXT: ds_add_u64 v10, v[11:12] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -3568,23 +3704,26 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_co_u32 v2, vcc, v3, v2 ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_permlane64_b32 v3, v2 ; GFX1164_DPP-NEXT: v_permlane64_b32 v4, v1 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff ; GFX1164_DPP-NEXT: v_add_co_u32 v2, vcc, v2, v3 ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v1, v4, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v3 -; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164_DPP-NEXT: s_cbranch_execz .LBB7_2 +; GFX1164_DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164_DPP-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX1164_DPP-NEXT: ; %bb.1: ; GFX1164_DPP-NEXT: ds_add_u64 v7, v[8:9] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -3634,19 +3773,21 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 
v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_add_co_u32 v2, vcc_lo, v3, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v4, vcc_lo ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v3 -; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132_DPP-NEXT: s_cbranch_execz .LBB7_2 +; GFX1132_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX1132_DPP-NEXT: ; %bb.1: ; GFX1132_DPP-NEXT: ds_add_u64 v7, v[8:9] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -3664,12 +3805,14 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX7LESS-LABEL: sub_i32_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB8_2 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 @@ -3678,8 +3821,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; 
GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: .LBB8_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: .LBB8_2: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 @@ -3696,9 +3839,11 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_cbranch_execz .LBB8_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_mul_i32 s4, s4, 5 @@ -3707,8 +3852,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB8_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: .LBB8_2: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -3725,9 +3870,11 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 @@ -3735,8 +3882,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB8_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: .LBB8_2: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -3750,12 +3897,14 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1064-LABEL: sub_i32_constant: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_mov_b64 s[4:5], exec -; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB8_2 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 @@ -3764,9 +3913,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB8_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: .LBB8_2: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 @@ -3781,11 +3930,13 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1032-LABEL: sub_i32_constant: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_mov_b32 s1, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB8_2 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; 
GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 @@ -3794,9 +3945,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB8_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: .LBB8_2: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 @@ -3816,8 +3967,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB8_2 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 @@ -3827,8 +3980,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB8_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: .LBB8_2: ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -3849,8 +4002,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB8_2 +; 
GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -3859,8 +4014,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB8_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: .LBB8_2: ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -3883,13 +4038,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX7LESS-LABEL: sub_i32_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: s_load_dword s6, s[2:3], 0xb ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB9_2 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -3899,8 +4056,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: .LBB9_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: .LBB9_2: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 
s2, -1 @@ -3913,14 +4070,16 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; ; GFX8-LABEL: sub_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_cbranch_execz .LBB9_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -3930,8 +4089,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB9_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: .LBB9_2: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -3944,14 +4103,16 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB9_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -3960,8 +4121,8 @@ define 
amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB9_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: .LBB9_2: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -3976,12 +4137,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX1064-NEXT: s_mov_b64 s[4:5], exec -; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB9_2 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 @@ -3991,9 +4154,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB9_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: .LBB9_2: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -4008,11 +4171,13 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x2c ; GFX1032-NEXT: s_mov_b32 s4, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: s_mov_b32 s1, exec_lo ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; 
GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB9_2 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 @@ -4022,9 +4187,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB9_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: .LBB9_2: ; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0 @@ -4044,8 +4209,10 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB9_2 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 @@ -4056,8 +4223,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1164-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB9_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: .LBB9_2: ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -4079,8 +4246,10 @@ define amdgpu_kernel void 
@sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB9_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) @@ -4090,8 +4259,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB9_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132-NEXT: .LBB9_2: ; GFX1132-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_mul_lo_u32 v0, s0, v0 @@ -4132,18 +4301,19 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4 +; GFX7LESS_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_scc0 .LBB10_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_ITERATIVE-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS_ITERATIVE-NEXT: 
.LBB10_4: ; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: .LBB10_4: ; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -4173,18 +4343,19 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4 +; GFX8_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX8_ITERATIVE-NEXT: s_cbranch_scc0 .LBB10_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_ITERATIVE-NEXT: .LBB10_4: ; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_ITERATIVE-NEXT: .LBB10_4: ; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 @@ -4214,17 +4385,18 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4 +; 
GFX9_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX9_ITERATIVE-NEXT: s_cbranch_scc0 .LBB10_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX9_ITERATIVE-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_ITERATIVE-NEXT: .LBB10_4: ; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_ITERATIVE-NEXT: .LBB10_4: ; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 @@ -4254,18 +4426,19 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4 +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc0 .LBB10_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064_ITERATIVE-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1064_ITERATIVE-NEXT: .LBB10_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: .LBB10_4: ; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 @@ -4295,18 +4468,19 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 
vcc_lo, 0, v0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4 +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc0 .LBB10_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 ; GFX1032_ITERATIVE-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1032_ITERATIVE-NEXT: .LBB10_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: .LBB10_4: ; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 @@ -4341,18 +4515,18 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4 +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc0 .LBB10_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_ITERATIVE-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv 
-; GFX1164_ITERATIVE-NEXT: .LBB10_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: .LBB10_4: ; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 @@ -4385,19 +4559,20 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4 +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc0 .LBB10_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 ; GFX1132_ITERATIVE-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1132_ITERATIVE-NEXT: .LBB10_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: .LBB10_4: ; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 @@ -4425,6 +4600,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; ; GFX8_DPP-LABEL: sub_i32_varying: ; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, 
exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 @@ -4432,7 +4608,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX8_DPP-NEXT: s_nop 0 @@ -4445,21 +4621,22 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s6, v1, 63 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8_DPP-NEXT: s_cbranch_execz .LBB10_2 +; GFX8_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX8_DPP-NEXT: s_cbranch_scc0 .LBB10_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s6 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 ; GFX8_DPP-NEXT: ds_sub_rtn_u32 v0, v3, v0 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_DPP-NEXT: .LBB10_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: .LBB10_2: ; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v2 @@ -4472,6 +4649,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; ; GFX9_DPP-LABEL: sub_i32_varying: ; GFX9_DPP: ; %bb.0: ; %entry +; 
GFX9_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 @@ -4479,7 +4657,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9_DPP-NEXT: s_nop 0 @@ -4492,20 +4670,21 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s6, v1, 63 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_DPP-NEXT: s_cbranch_execz .LBB10_2 +; GFX9_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9_DPP-NEXT: s_cbranch_scc0 .LBB10_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s6 ; GFX9_DPP-NEXT: ds_sub_rtn_u32 v0, v3, v0 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_DPP-NEXT: .LBB10_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: .LBB10_2: ; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v2 @@ -4518,11 +4697,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; ; 
GFX1064_DPP-LABEL: sub_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -4531,38 +4711,42 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s6 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 16 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] 
; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 63 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064_DPP-NEXT: s_cbranch_execz .LBB10_2 +; GFX1064_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064_DPP-NEXT: s_cbranch_scc0 .LBB10_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s7 +; GFX1064_DPP-NEXT: s_mov_b32 s4, s7 ; GFX1064_DPP-NEXT: ds_sub_rtn_u32 v0, v4, v0 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv -; GFX1064_DPP-NEXT: .LBB10_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_DPP-NEXT: .LBB10_2: ; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -4574,11 +4758,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: sub_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: 
s_or_saveexec_b32 s1, -1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -4587,28 +4772,31 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s1, -1 +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s1, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032_DPP-NEXT: s_cbranch_execz .LBB10_2 +; GFX1032_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_scc0 .LBB10_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s5 ; GFX1032_DPP-NEXT: ds_sub_rtn_u32 v0, v4, v0 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX1032_DPP-NEXT: buffer_gl0_inv -; GFX1032_DPP-NEXT: .LBB10_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032_DPP-NEXT: .LBB10_2: ; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -4621,12 +4809,13 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-LABEL: sub_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -4639,40 +4828,43 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s6 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1164_DPP-NEXT: 
v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 16 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 63 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164_DPP-NEXT: s_cbranch_execz .LBB10_2 +; GFX1164_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164_DPP-NEXT: s_cbranch_scc0 .LBB10_2 ; 
GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s7 +; GFX1164_DPP-NEXT: s_mov_b32 s4, s7 ; GFX1164_DPP-NEXT: ds_sub_rtn_u32 v0, v4, v0 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv -; GFX1164_DPP-NEXT: .LBB10_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_DPP-NEXT: .LBB10_2: ; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -4688,12 +4880,13 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: sub_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s1, -1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -4705,28 +4898,31 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_mov_b32 
exec_lo, s0 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s1, -1 +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s1, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132_DPP-NEXT: s_cbranch_execz .LBB10_2 +; GFX1132_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_scc0 .LBB10_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s5 ; GFX1132_DPP-NEXT: ds_sub_rtn_u32 v0, v4, v0 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1132_DPP-NEXT: .LBB10_2: -; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -4764,9 +4960,10 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; 
GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_scc0 .LBB11_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2 @@ -4793,9 +4990,10 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX8_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX8_ITERATIVE-NEXT: s_cbranch_scc0 .LBB11_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2 @@ -4822,9 +5020,10 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX9_ITERATIVE-NEXT: s_cbranch_scc0 .LBB11_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2 @@ -4850,9 +5049,10 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, 
exec_lo, 0 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc0 .LBB11_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2 @@ -4878,9 +5078,10 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc0 .LBB11_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v1, s0 @@ -4908,12 +5109,13 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX1164_ITERATIVE-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc0 .LBB11_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2 @@ -4941,11 +5143,12 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc0 .LBB11_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX1132_ITERATIVE-NEXT: ds_sub_u32 v0, v1 @@ -4985,12 +5188,12 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX8_DPP-NEXT: v_readlane_b32 s2, v1, 63 ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: s_mov_b32 s0, s2 ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8_DPP-NEXT: s_cbranch_execz .LBB11_2 +; GFX8_DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX8_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX8_DPP-NEXT: s_cbranch_scc0 .LBB11_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s2 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 ; GFX8_DPP-NEXT: ds_sub_u32 v2, v0 ; 
GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -5020,12 +5223,12 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9_DPP-NEXT: v_readlane_b32 s2, v1, 63 ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: s_mov_b32 s0, s2 ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9_DPP-NEXT: s_cbranch_execz .LBB11_2 +; GFX9_DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9_DPP-NEXT: s_cbranch_scc0 .LBB11_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s2 ; GFX9_DPP-NEXT: ds_sub_u32 v2, v0 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB11_2: @@ -5055,8 +5258,9 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_DPP-NEXT: s_add_i32 s0, s2, s3 ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064_DPP-NEXT: s_cbranch_execz .LBB11_2 +; GFX1064_DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064_DPP-NEXT: s_cbranch_scc0 .LBB11_2 ; GFX1064_DPP-NEXT: ; %bb.1: ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s0 ; GFX1064_DPP-NEXT: ds_sub_u32 v0, v3 @@ -5078,14 +5282,17 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 
v3, v1 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032_DPP-NEXT: s_cbranch_execz .LBB11_2 +; GFX1032_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_scc0 .LBB11_2 ; GFX1032_DPP-NEXT: ; %bb.1: ; GFX1032_DPP-NEXT: ds_sub_u32 v0, v3 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -5112,22 +5319,25 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1164_DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; 
GFX1164_DPP-NEXT: s_cbranch_execz .LBB11_2 +; GFX1164_DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164_DPP-NEXT: s_cbranch_scc0 .LBB11_2 ; GFX1164_DPP-NEXT: ; %bb.1: ; GFX1164_DPP-NEXT: ds_sub_u32 v0, v3 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -5151,17 +5361,20 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1 -; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132_DPP-NEXT: s_cbranch_execz .LBB11_2 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1 +; GFX1132_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_scc0 .LBB11_2 ; GFX1132_DPP-NEXT: ; %bb.1: ; GFX1132_DPP-NEXT: ds_sub_u32 v0, v3 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -5178,12 +5391,14 @@ define amdgpu_kernel void @sub_i64_constant(ptr 
addrspace(1) %out) { ; GFX7LESS-LABEL: sub_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7LESS-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB12_2 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB12_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 @@ -5192,8 +5407,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: .LBB12_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: .LBB12_2: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 @@ -5214,9 +5429,11 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_cbranch_execz .LBB12_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB12_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_mul_i32 s4, s4, 5 @@ -5225,8 +5442,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB12_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: .LBB12_2: ; 
GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 ; GFX8-NEXT: v_readfirstlane_b32 s5, v0 @@ -5247,9 +5464,11 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB12_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB12_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 @@ -5257,8 +5476,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB12_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: .LBB12_2: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 ; GFX9-NEXT: v_readfirstlane_b32 s5, v0 @@ -5276,12 +5495,14 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX1064-LABEL: sub_i64_constant: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB12_2 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB12_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 @@ -5290,9 +5511,9 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 
; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB12_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: .LBB12_2: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 @@ -5310,11 +5531,13 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX1032-LABEL: sub_i64_constant: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_mov_b32 s1, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB12_2 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB12_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 @@ -5323,9 +5546,9 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB12_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: .LBB12_2: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 @@ -5348,8 +5571,10 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB12_2 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: 
s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB12_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 @@ -5359,8 +5584,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB12_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: .LBB12_2: ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v2 @@ -5384,8 +5609,10 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB12_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB12_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX1132-NEXT: v_mov_b32_e32 v1, 0 @@ -5395,8 +5622,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB12_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: .LBB12_2: ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2 @@ -5422,13 +5649,15 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX7LESS-LABEL: sub_i64_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec +; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec ; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; 
GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7LESS-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB13_2 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB13_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 @@ -5442,8 +5671,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: .LBB13_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: .LBB13_2: ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -5463,14 +5692,16 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; ; GFX8-LABEL: sub_i64_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB13_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB13_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, s8 @@ -5482,8 +5713,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB13_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: .LBB13_2: ; 
GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s4, s0 ; GFX8-NEXT: s_mov_b32 s5, s1 @@ -5502,14 +5733,16 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; ; GFX9-LABEL: sub_i64_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[8:9], exec +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB13_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB13_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[8:9] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -5522,8 +5755,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB13_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: .LBB13_2: ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s6, v2, 0 ; GFX9-NEXT: s_mov_b32 s0, s4 @@ -5544,12 +5777,14 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1064-NEXT: s_mov_b64 s[8:9], exec +; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB13_2 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB13_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: 
s_bcnt1_i32_b64 s2, s[8:9] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 @@ -5563,9 +5798,9 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB13_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: .LBB13_2: ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s6, v2, 0 ; GFX1064-NEXT: s_mov_b32 s6, -1 @@ -5583,11 +5818,13 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1032-NEXT: s_mov_b32 s1, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB13_2 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB13_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 @@ -5601,9 +5838,9 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB13_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: .LBB13_2: ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s0, s6, v2, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s1, v1 @@ -5626,8 +5863,10 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB13_2 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB13_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 @@ -5641,8 +5880,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB13_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB13_2: ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 @@ -5668,8 +5907,10 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB13_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB13_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 @@ -5683,8 +5924,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB13_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: .LBB13_2: ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 @@ -5733,10 +5974,11 @@ define amdgpu_kernel void 
@sub_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 +; GFX7LESS_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_scc0 .LBB14_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -5744,8 +5986,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS_ITERATIVE-NEXT: .LBB14_4: ; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: .LBB14_4: ; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -5782,10 +6024,11 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 +; GFX8_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX8_ITERATIVE-NEXT: s_cbranch_scc0 .LBB14_4 ; 
GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -5793,8 +6036,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_ITERATIVE-NEXT: .LBB14_4: ; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_ITERATIVE-NEXT: .LBB14_4: ; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 @@ -5831,18 +6074,19 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 +; GFX9_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX9_ITERATIVE-NEXT: s_cbranch_scc0 .LBB14_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX9_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_ITERATIVE-NEXT: .LBB14_4: ; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_ITERATIVE-NEXT: .LBB14_4: ; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 @@ -5879,9 +6123,10 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1064_ITERATIVE-NEXT: 
v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc0 .LBB14_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -5889,9 +6134,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1064_ITERATIVE-NEXT: .LBB14_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: .LBB14_4: ; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 @@ -5927,9 +6172,10 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 -; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, vcc_lo, exec_lo +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc0 .LBB14_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -5937,9 +6183,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr 
addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1032_ITERATIVE-NEXT: .LBB14_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: .LBB14_4: ; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 @@ -5981,10 +6227,10 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc0 .LBB14_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 @@ -5992,8 +6238,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: ds_sub_rtn_u64 v[2:3], v4, v[2:3] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1164_ITERATIVE-NEXT: .LBB14_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: .LBB14_4: ; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 @@ -6032,20 +6278,21 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 -; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, vcc_lo, exec_lo +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc0 .LBB14_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 ; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 ; GFX1132_ITERATIVE-NEXT: ds_sub_rtn_u64 v[2:3], v4, v[2:3] ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1132_ITERATIVE-NEXT: .LBB14_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: .LBB14_4: ; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 @@ -6076,20 +6323,21 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-LABEL: sub_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry ; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX8_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX8_DPP-NEXT: 
v_mov_b32_e32 v3, v7 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 ; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX8_DPP-NEXT: s_nop 0 @@ -6137,23 +6385,24 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc -; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s7, v4, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s6, v3, 63 ; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8_DPP-NEXT: s_cbranch_execz .LBB14_2 +; GFX8_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX8_DPP-NEXT: s_cbranch_scc0 .LBB14_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s6 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 ; GFX8_DPP-NEXT: ds_sub_rtn_u64 v[9:10], v8, v[9:10] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_DPP-NEXT: .LBB14_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: .LBB14_2: ; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v10 @@ -6171,20 +6420,21 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) 
%out) { ; GFX9_DPP-LABEL: sub_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX9_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 ; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX9_DPP-NEXT: s_nop 0 @@ -6232,22 +6482,23 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc -; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s7, v4, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s6, v3, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_DPP-NEXT: s_cbranch_execz .LBB14_2 +; GFX9_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9_DPP-NEXT: s_cbranch_scc0 .LBB14_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5 -; 
GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s6 ; GFX9_DPP-NEXT: ds_sub_rtn_u64 v[9:10], v8, v[9:10] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_DPP-NEXT: .LBB14_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: .LBB14_2: ; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v10 @@ -6265,18 +6516,19 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-LABEL: sub_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -6313,52 +6565,55 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 ; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s4 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v3, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v4, 31 +; 
GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s6 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s7 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 ; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s8, v3, 47 -; GFX1064_DPP-NEXT: v_writelane_b32 v2, s4, 16 -; GFX1064_DPP-NEXT: v_writelane_b32 v1, s5, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 63 -; GFX1064_DPP-NEXT: v_readlane_b32 s9, v4, 47 -; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32 -; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX1064_DPP-NEXT: v_writelane_b32 v2, s9, 48 -; GFX1064_DPP-NEXT: v_writelane_b32 v1, s8, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s9, v3, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 16 
+; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v3, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s10, v3, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s11, v4, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v4, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s11, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s10, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1064_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064_DPP-NEXT: s_cbranch_execz .LBB14_2 +; GFX1064_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064_DPP-NEXT: s_cbranch_scc0 .LBB14_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s5 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s4 ; GFX1064_DPP-NEXT: ds_sub_rtn_u64 v[11:12], v10, v[11:12] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv -; GFX1064_DPP-NEXT: .LBB14_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_DPP-NEXT: .LBB14_2: ; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1 @@ -6374,6 +6629,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-LABEL: sub_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1032_DPP-NEXT: s_mov_b32 s4, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; 
GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -6385,7 +6641,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -6420,32 +6676,35 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s5, -1 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s6, v4, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v3, 15 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v2, s6, 16 -; GFX1032_DPP-NEXT: v_writelane_b32 v1, s5, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_readlane_b32 s7, v4, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s5 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s5, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v2, s7, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s5 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; 
GFX1032_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032_DPP-NEXT: s_cbranch_execz .LBB14_2 +; GFX1032_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_scc0 .LBB14_2 ; GFX1032_DPP-NEXT: ; %bb.1: ; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0 ; GFX1032_DPP-NEXT: ds_sub_rtn_u64 v[11:12], v10, v[11:12] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv -; GFX1032_DPP-NEXT: .LBB14_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_DPP-NEXT: .LBB14_2: ; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1 @@ -6460,15 +6719,16 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; ; GFX1164_DPP-LABEL: sub_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0 ; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v8 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 @@ -6476,7 +6736,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: 
s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -6510,56 +6770,59 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v4, 31 ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v7, vcc ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s5 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s7 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v3, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s6 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; 
GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 31 -; GFX1164_DPP-NEXT: v_writelane_b32 v1, s4, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v2, s5, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 63 -; GFX1164_DPP-NEXT: v_readlane_b32 s8, v4, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s9, v3, 47 -; GFX1164_DPP-NEXT: v_writelane_b32 v1, s6, 32 -; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 32 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 48 -; GFX1164_DPP-NEXT: v_writelane_b32 v2, s9, 48 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v4, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v3, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s10, v4, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s11, v3, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s9, 32 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 63 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s10, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s11, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX1164_DPP-NEXT: 
s_mov_b32 s6, -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr10_vgpr11 -; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164_DPP-NEXT: s_cbranch_execz .LBB14_2 +; GFX1164_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164_DPP-NEXT: s_cbranch_scc0 .LBB14_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s5 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s4 ; GFX1164_DPP-NEXT: ds_sub_rtn_u64 v[10:11], v9, v[10:11] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164_DPP-NEXT: .LBB14_2: -; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v10 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v1 @@ -6577,6 +6840,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; ; GFX1132_DPP-LABEL: sub_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -6592,7 +6856,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -6622,33 +6886,35 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) 
| instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1132_DPP-NEXT: v_readlane_b32 s0, v4, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s5, -1 +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v4, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s0, v4, 31 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v3, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v1, s5, 16 -; GFX1132_DPP-NEXT: v_writelane_b32 v2, s6, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s5 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s5, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v2, s7, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s5 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; 
GFX1132_DPP-NEXT: ; implicit-def: $vgpr10_vgpr11 -; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132_DPP-NEXT: s_cbranch_execz .LBB14_2 +; GFX1132_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_scc0 .LBB14_2 ; GFX1132_DPP-NEXT: ; %bb.1: ; GFX1132_DPP-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 ; GFX1132_DPP-NEXT: ds_sub_rtn_u64 v[10:11], v9, v[10:11] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv -; GFX1132_DPP-NEXT: .LBB14_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_DPP-NEXT: .LBB14_2: ; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v10 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v1 @@ -6693,18 +6959,19 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4 +; GFX7LESS_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_scc0 .LBB15_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_ITERATIVE-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS_ITERATIVE-NEXT: .LBB15_4: ; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: .LBB15_4: ; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ 
-6734,18 +7001,19 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4 +; GFX8_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX8_ITERATIVE-NEXT: s_cbranch_scc0 .LBB15_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_ITERATIVE-NEXT: .LBB15_4: ; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_ITERATIVE-NEXT: .LBB15_4: ; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 @@ -6775,17 +7043,18 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4 +; GFX9_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX9_ITERATIVE-NEXT: s_cbranch_scc0 .LBB15_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX9_ITERATIVE-NEXT: ds_and_rtn_b32 v0, v0, v2 ; 
GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_ITERATIVE-NEXT: .LBB15_4: ; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_ITERATIVE-NEXT: .LBB15_4: ; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 @@ -6815,18 +7084,19 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4 +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc0 .LBB15_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064_ITERATIVE-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1064_ITERATIVE-NEXT: .LBB15_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: .LBB15_4: ; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 @@ -6856,18 +7126,19 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4 +; GFX1032_ITERATIVE-NEXT: 
s_xor_b32 s1, vcc_lo, exec_lo +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc0 .LBB15_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 ; GFX1032_ITERATIVE-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1032_ITERATIVE-NEXT: .LBB15_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: .LBB15_4: ; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 @@ -6902,18 +7173,18 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4 +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc0 .LBB15_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_ITERATIVE-NEXT: ds_and_rtn_b32 v1, v1, v2 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1164_ITERATIVE-NEXT: .LBB15_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: .LBB15_4: ; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; 
GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 @@ -6946,19 +7217,20 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4 +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc0 .LBB15_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 ; GFX1132_ITERATIVE-NEXT: ds_and_rtn_b32 v1, v1, v2 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1132_ITERATIVE-NEXT: .LBB15_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: .LBB15_4: ; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 @@ -6986,16 +7258,17 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; ; GFX8_DPP-LABEL: and_i32_varying: ; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] ; 
GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1 ; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -7007,22 +7280,23 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s6, v2, 63 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8_DPP-NEXT: s_cbranch_execz .LBB15_2 +; GFX8_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX8_DPP-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX8_DPP-NEXT: ; %bb.1: ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s6 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 ; GFX8_DPP-NEXT: ds_and_rtn_b32 v0, v0, v3 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_DPP-NEXT: .LBB15_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: .LBB15_2: ; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v1 @@ -7035,16 +7309,17 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; ; GFX9_DPP-LABEL: and_i32_varying: ; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX9_DPP-NEXT: 
v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -7056,21 +7331,22 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s6, v2, 63 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_DPP-NEXT: s_cbranch_execz .LBB15_2 +; GFX9_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9_DPP-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX9_DPP-NEXT: ; %bb.1: ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s6 ; GFX9_DPP-NEXT: ds_and_rtn_b32 v0, v0, v3 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_DPP-NEXT: .LBB15_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: .LBB15_2: ; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; 
GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1 @@ -7083,11 +7359,12 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: and_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1 ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf @@ -7096,38 +7373,42 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s6 ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, 
v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 16 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 63 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064_DPP-NEXT: s_cbranch_execz .LBB15_2 +; GFX1064_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064_DPP-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX1064_DPP-NEXT: ; %bb.1: ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s7 +; GFX1064_DPP-NEXT: s_mov_b32 s4, s7 ; GFX1064_DPP-NEXT: ds_and_rtn_b32 v0, v0, v4 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv -; GFX1064_DPP-NEXT: .LBB15_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_DPP-NEXT: .LBB15_2: ; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -7139,11 +7420,12 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: and_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1032_DPP-NEXT: 
s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s1, -1 ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1 ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf @@ -7152,28 +7434,31 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s1, -1 +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s1, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032_DPP-NEXT: s_cbranch_execz .LBB15_2 +; GFX1032_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX1032_DPP-NEXT: ; %bb.1: ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s5 ; GFX1032_DPP-NEXT: 
ds_and_rtn_b32 v0, v0, v4 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv -; GFX1032_DPP-NEXT: .LBB15_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032_DPP-NEXT: .LBB15_2: ; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -7186,12 +7471,13 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-LABEL: and_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7204,40 +7490,43 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s6 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: 
v_readlane_b32 s6, v1, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 16 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 63 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164_DPP-NEXT: s_cbranch_execz .LBB15_2 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; 
GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1164_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164_DPP-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX1164_DPP-NEXT: ; %bb.1: ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s7 +; GFX1164_DPP-NEXT: s_mov_b32 s4, s7 ; GFX1164_DPP-NEXT: ds_and_rtn_b32 v0, v0, v4 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv -; GFX1164_DPP-NEXT: .LBB15_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_DPP-NEXT: .LBB15_2: ; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -7253,12 +7542,13 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: and_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s1, -1 ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7270,29 +7560,32 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 -; 
GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s1, -1 +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132_DPP-NEXT: s_cbranch_execz .LBB15_2 +; GFX1132_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX1132_DPP-NEXT: ; %bb.1: ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s5 ; GFX1132_DPP-NEXT: ds_and_rtn_b32 v0, v0, v4 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1132_DPP-NEXT: .LBB15_2: -; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -7336,10 +7629,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) 
%out) { ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 +; GFX7LESS_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -7347,8 +7641,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4] ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS_ITERATIVE-NEXT: .LBB16_4: ; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: .LBB16_4: ; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -7383,10 +7677,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 +; GFX8_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX8_ITERATIVE-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; 
GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -7394,8 +7689,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4] ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_ITERATIVE-NEXT: .LBB16_4: ; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_ITERATIVE-NEXT: .LBB16_4: ; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 @@ -7430,18 +7725,19 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 +; GFX9_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX9_ITERATIVE-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX9_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_ITERATIVE-NEXT: .LBB16_4: ; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_ITERATIVE-NEXT: .LBB16_4: ; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 @@ -7476,9 +7772,10 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, 
v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -7486,9 +7783,9 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4] ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1064_ITERATIVE-NEXT: .LBB16_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: .LBB16_4: ; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4 @@ -7523,9 +7820,10 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 -; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, vcc_lo, exec_lo +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -7533,9 +7831,9 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; 
GFX1032_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4] ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1032_ITERATIVE-NEXT: .LBB16_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: .LBB16_4: ; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4 @@ -7574,10 +7872,10 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 @@ -7585,8 +7883,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: ds_and_rtn_b64 v[2:3], v4, v[2:3] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1164_ITERATIVE-NEXT: .LBB16_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: .LBB16_4: ; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v2 @@ -7622,20 +7920,21 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; 
GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 -; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, vcc_lo, exec_lo +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 ; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 ; GFX1132_ITERATIVE-NEXT: ds_and_rtn_b64 v[2:3], v4, v[2:3] ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1132_ITERATIVE-NEXT: .LBB16_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: .LBB16_4: ; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v2 @@ -7666,20 +7965,21 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-LABEL: and_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX8_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX8_DPP-NEXT: 
v_mov_b32_e32 v3, v5 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v6 ; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, -1 ; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 0 @@ -7697,23 +7997,24 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s7, v4, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s6, v3, 63 ; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8_DPP-NEXT: s_cbranch_execz .LBB16_2 +; GFX8_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX8_DPP-NEXT: s_cbranch_scc0 .LBB16_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s6 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 ; GFX8_DPP-NEXT: ds_and_rtn_b64 v[7:8], v6, v[7:8] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_DPP-NEXT: .LBB16_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: .LBB16_2: ; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: 
v_readfirstlane_b32 s4, v8 @@ -7730,20 +8031,21 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-LABEL: and_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX9_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v5 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v6 ; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, -1 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 0 @@ -7761,22 +8063,23 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s7, v4, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s6, v3, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX9_DPP-NEXT: 
s_and_saveexec_b64 s[0:1], vcc -; GFX9_DPP-NEXT: s_cbranch_execz .LBB16_2 +; GFX9_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9_DPP-NEXT: s_cbranch_scc0 .LBB16_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s6 ; GFX9_DPP-NEXT: ds_and_rtn_b64 v[7:8], v6, v[7:8] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_DPP-NEXT: .LBB16_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: .LBB16_2: ; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8 @@ -7794,13 +8097,14 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP: ; %bb.0: ; %entry ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v7 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v8 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, -1 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, -1 @@ -7817,50 +8121,53 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s4 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s5 +; GFX1064_DPP-NEXT: 
v_readlane_b32 s6, v1, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v2, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s6 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s7 ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 -; GFX1064_DPP-NEXT: v_writelane_b32 v6, s4, 16 -; GFX1064_DPP-NEXT: v_writelane_b32 v5, s5, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 63 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1064_DPP-NEXT: v_readlane_b32 s9, v2, 47 -; GFX1064_DPP-NEXT: v_writelane_b32 v6, s6, 32 -; GFX1064_DPP-NEXT: v_writelane_b32 v5, s7, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 15 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX1064_DPP-NEXT: v_writelane_b32 v6, s9, 48 -; GFX1064_DPP-NEXT: v_writelane_b32 v5, s8, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v2, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s9, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v6, s6, 16 +; GFX1064_DPP-NEXT: 
v_writelane_b32 v5, s7, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s10, v1, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s11, v2, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v2, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v6, s8, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v5, s9, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v6, s11, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v5, s10, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1064_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064_DPP-NEXT: s_cbranch_execz .LBB16_2 +; GFX1064_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064_DPP-NEXT: s_cbranch_scc0 .LBB16_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s4 ; GFX1064_DPP-NEXT: ds_and_rtn_b64 v[9:10], v8, v[9:10] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv -; GFX1064_DPP-NEXT: .LBB16_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_DPP-NEXT: .LBB16_2: ; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v5 ; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v10 @@ -7877,13 +8184,14 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP: ; %bb.0: ; %entry ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1032_DPP-NEXT: s_mov_b32 s4, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v7 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v8 ; GFX1032_DPP-NEXT: s_not_b32 
exec_lo, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, -1 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, -1 @@ -7900,32 +8208,35 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 -; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s5, -1 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v6, s6, 16 -; GFX1032_DPP-NEXT: v_writelane_b32 v5, s5, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_readlane_b32 s7, v2, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s5 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s5, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v6, s7, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v5, s6, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s5 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 
+; GFX1032_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032_DPP-NEXT: s_cbranch_execz .LBB16_2 +; GFX1032_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_scc0 .LBB16_2 ; GFX1032_DPP-NEXT: ; %bb.1: ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0 ; GFX1032_DPP-NEXT: ds_and_rtn_b64 v[9:10], v8, v[9:10] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv -; GFX1032_DPP-NEXT: .LBB16_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_DPP-NEXT: .LBB16_2: ; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v5 ; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v10 @@ -7942,6 +8253,7 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v7 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v8 @@ -7949,7 +8261,7 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, -1 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf @@ -7974,53 +8286,56 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa 
bank_mask:0xf ; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v2, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s6 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s7 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47 -; GFX1164_DPP-NEXT: v_writelane_b32 v6, s4, 16 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s5, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 63 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47 -; 
GFX1164_DPP-NEXT: v_writelane_b32 v6, s6, 32 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 32 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 15 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX1164_DPP-NEXT: v_writelane_b32 v6, s9, 48 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v2, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v1, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v6, s6, 16 +; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s10, v1, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s11, v2, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v2, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v6, s8, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v5, s9, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v6, s11, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v5, s10, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164_DPP-NEXT: s_cbranch_execz .LBB16_2 +; GFX1164_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164_DPP-NEXT: s_cbranch_scc0 .LBB16_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s4 ; GFX1164_DPP-NEXT: 
ds_and_rtn_b64 v[9:10], v8, v[9:10] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164_DPP-NEXT: .LBB16_2: -; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v5 ; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v10 @@ -8039,6 +8354,7 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: and_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v7, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v7 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v8 @@ -8046,7 +8362,7 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, -1 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf @@ -8067,34 +8383,36 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa 
bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s5, -1 +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v6, s6, 16 -; GFX1132_DPP-NEXT: v_writelane_b32 v5, s5, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: v_readlane_b32 s7, v2, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s5 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s5, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_writelane_b32 v6, s7, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v5, s6, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s5 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132_DPP-NEXT: s_cbranch_execz .LBB16_2 +; GFX1132_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_scc0 .LBB16_2 ; GFX1132_DPP-NEXT: ; %bb.1: ; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0 ; GFX1132_DPP-NEXT: ds_and_rtn_b64 v[9:10], v8, v[9:10] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv -; GFX1132_DPP-NEXT: .LBB16_2: ; 
GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_DPP-NEXT: .LBB16_2: ; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v5 ; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v10 @@ -8139,18 +8457,19 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4 +; GFX7LESS_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_scc0 .LBB17_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_ITERATIVE-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS_ITERATIVE-NEXT: .LBB17_4: ; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: .LBB17_4: ; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -8180,18 +8499,19 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 
-; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4 +; GFX8_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX8_ITERATIVE-NEXT: s_cbranch_scc0 .LBB17_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_ITERATIVE-NEXT: .LBB17_4: ; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_ITERATIVE-NEXT: .LBB17_4: ; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 @@ -8221,17 +8541,18 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4 +; GFX9_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX9_ITERATIVE-NEXT: s_cbranch_scc0 .LBB17_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX9_ITERATIVE-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_ITERATIVE-NEXT: .LBB17_4: ; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_ITERATIVE-NEXT: .LBB17_4: ; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 @@ -8261,18 +8582,19 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 
0, v0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4 +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc0 .LBB17_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064_ITERATIVE-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1064_ITERATIVE-NEXT: .LBB17_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: .LBB17_4: ; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 @@ -8302,18 +8624,19 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4 +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc0 .LBB17_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 ; GFX1032_ITERATIVE-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1032_ITERATIVE-NEXT: .LBB17_4: ; GFX1032_ITERATIVE-NEXT: 
s_waitcnt_depctr 0xffe3 ; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: .LBB17_4: ; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 @@ -8348,18 +8671,18 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4 +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc0 .LBB17_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_ITERATIVE-NEXT: ds_or_rtn_b32 v1, v1, v2 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1164_ITERATIVE-NEXT: .LBB17_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: .LBB17_4: ; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 @@ -8392,19 +8715,20 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; 
GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4 +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc0 .LBB17_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 ; GFX1132_ITERATIVE-NEXT: ds_or_rtn_b32 v1, v1, v2 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1132_ITERATIVE-NEXT: .LBB17_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: .LBB17_4: ; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 @@ -8432,6 +8756,7 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; ; GFX8_DPP-LABEL: or_i32_varying: ; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 @@ -8439,7 +8764,7 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX8_DPP-NEXT: s_nop 0 @@ -8452,21 +8777,22 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 
row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s6, v1, 63 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8_DPP-NEXT: s_cbranch_execz .LBB17_2 +; GFX8_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX8_DPP-NEXT: s_cbranch_scc0 .LBB17_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s6 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 ; GFX8_DPP-NEXT: ds_or_rtn_b32 v0, v3, v0 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_DPP-NEXT: .LBB17_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: .LBB17_2: ; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v2 @@ -8479,6 +8805,7 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; ; GFX9_DPP-LABEL: or_i32_varying: ; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 @@ -8486,7 +8813,7 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9_DPP-NEXT: s_nop 0 @@ -8499,20 +8826,21 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa 
bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s6, v1, 63 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_DPP-NEXT: s_cbranch_execz .LBB17_2 +; GFX9_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9_DPP-NEXT: s_cbranch_scc0 .LBB17_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s6 ; GFX9_DPP-NEXT: ds_or_rtn_b32 v0, v3, v0 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_DPP-NEXT: .LBB17_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: .LBB17_2: ; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v2 @@ -8525,11 +8853,12 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: or_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -8538,38 +8867,42 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; 
GFX1064_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s6 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 16 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 63 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1064_DPP-NEXT: 
s_and_saveexec_b64 s[0:1], vcc -; GFX1064_DPP-NEXT: s_cbranch_execz .LBB17_2 +; GFX1064_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064_DPP-NEXT: s_cbranch_scc0 .LBB17_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s7 +; GFX1064_DPP-NEXT: s_mov_b32 s4, s7 ; GFX1064_DPP-NEXT: ds_or_rtn_b32 v0, v4, v0 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv -; GFX1064_DPP-NEXT: .LBB17_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_DPP-NEXT: .LBB17_2: ; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -8581,11 +8914,12 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: or_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s1, -1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -8594,28 +8928,31 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1032_DPP-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s1, -1 +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s1, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032_DPP-NEXT: s_cbranch_execz .LBB17_2 +; GFX1032_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_scc0 .LBB17_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s5 ; GFX1032_DPP-NEXT: ds_or_rtn_b32 v0, v4, v0 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv -; GFX1032_DPP-NEXT: .LBB17_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032_DPP-NEXT: .LBB17_2: ; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -8628,12 +8965,13 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-LABEL: or_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; 
GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -8646,40 +8984,43 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s6 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; 
GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 16 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 63 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164_DPP-NEXT: s_cbranch_execz .LBB17_2 +; GFX1164_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164_DPP-NEXT: s_cbranch_scc0 .LBB17_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s7 +; GFX1164_DPP-NEXT: s_mov_b32 s4, s7 ; GFX1164_DPP-NEXT: ds_or_rtn_b32 v0, v4, v0 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv -; GFX1164_DPP-NEXT: .LBB17_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_DPP-NEXT: .LBB17_2: ; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -8695,12 +9036,13 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: or_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) 
| instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s1, -1 ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -8712,28 +9054,31 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s1, -1 +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s1, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1132_DPP-NEXT: 
s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132_DPP-NEXT: s_cbranch_execz .LBB17_2 +; GFX1132_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_scc0 .LBB17_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s5 ; GFX1132_DPP-NEXT: ds_or_rtn_b32 v0, v4, v0 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1132_DPP-NEXT: .LBB17_2: -; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -8777,10 +9122,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 +; GFX7LESS_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_scc0 .LBB18_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -8788,8 +9134,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4] ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS_ITERATIVE-NEXT: .LBB18_4: ; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; 
GFX7LESS_ITERATIVE-NEXT: .LBB18_4: ; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -8824,10 +9170,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 +; GFX8_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX8_ITERATIVE-NEXT: s_cbranch_scc0 .LBB18_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -8835,8 +9182,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4] ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_ITERATIVE-NEXT: .LBB18_4: ; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_ITERATIVE-NEXT: .LBB18_4: ; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 @@ -8871,18 +9218,19 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; 
GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 +; GFX9_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX9_ITERATIVE-NEXT: s_cbranch_scc0 .LBB18_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX9_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_ITERATIVE-NEXT: .LBB18_4: ; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_ITERATIVE-NEXT: .LBB18_4: ; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 @@ -8917,9 +9265,10 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc0 .LBB18_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -8927,9 +9276,9 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4] ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1064_ITERATIVE-NEXT: .LBB18_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: .LBB18_4: ; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 ; GFX1064_ITERATIVE-NEXT: 
v_readfirstlane_b32 s2, v4 @@ -8964,9 +9313,10 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 -; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, vcc_lo, exec_lo +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc0 .LBB18_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -8974,9 +9324,9 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4] ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1032_ITERATIVE-NEXT: .LBB18_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: .LBB18_4: ; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4 @@ -9015,10 +9365,10 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164_ITERATIVE-NEXT: s_cmov_b64 
exec, vcc +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc0 .LBB18_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 @@ -9026,8 +9376,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: ds_or_rtn_b64 v[2:3], v4, v[2:3] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1164_ITERATIVE-NEXT: .LBB18_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: .LBB18_4: ; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v2 @@ -9063,20 +9413,21 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 -; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, vcc_lo, exec_lo +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc0 .LBB18_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 ; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 ; GFX1132_ITERATIVE-NEXT: ds_or_rtn_b64 v[2:3], v4, v[2:3] ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1132_ITERATIVE-NEXT: .LBB18_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, 
exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: .LBB18_4: ; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v2 @@ -9107,20 +9458,21 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-LABEL: or_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX8_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v5 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v6 ; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 0 @@ -9138,23 +9490,24 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s7, v4, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s6, v3, 63 ; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_mov_b64 
exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8_DPP-NEXT: s_cbranch_execz .LBB18_2 +; GFX8_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX8_DPP-NEXT: s_cbranch_scc0 .LBB18_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s6 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 ; GFX8_DPP-NEXT: ds_or_rtn_b64 v[7:8], v6, v[7:8] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_DPP-NEXT: .LBB18_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: .LBB18_2: ; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8 @@ -9171,20 +9524,21 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-LABEL: or_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX9_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v5 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v6 ; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf 
bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 0 @@ -9202,22 +9556,23 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s7, v4, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s6, v3, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_DPP-NEXT: s_cbranch_execz .LBB18_2 +; GFX9_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9_DPP-NEXT: s_cbranch_scc0 .LBB18_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s6 ; GFX9_DPP-NEXT: ds_or_rtn_b64 v[7:8], v6, v[7:8] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_DPP-NEXT: .LBB18_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: .LBB18_2: ; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8 @@ -9235,13 +9590,14 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP: ; %bb.0: ; %entry ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v7 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v8 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 
; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 @@ -9258,50 +9614,53 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s4 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s5 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v2, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s6 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s7 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 -; GFX1064_DPP-NEXT: v_writelane_b32 v6, s4, 16 -; GFX1064_DPP-NEXT: v_writelane_b32 
v5, s5, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 63 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1064_DPP-NEXT: v_readlane_b32 s9, v2, 47 -; GFX1064_DPP-NEXT: v_writelane_b32 v6, s6, 32 -; GFX1064_DPP-NEXT: v_writelane_b32 v5, s7, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 15 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX1064_DPP-NEXT: v_writelane_b32 v6, s9, 48 -; GFX1064_DPP-NEXT: v_writelane_b32 v5, s8, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v2, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s9, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v6, s6, 16 +; GFX1064_DPP-NEXT: v_writelane_b32 v5, s7, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s10, v1, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s11, v2, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v2, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v6, s8, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v5, s9, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v6, s11, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v5, s10, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1064_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064_DPP-NEXT: s_cbranch_execz .LBB18_2 +; GFX1064_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064_DPP-NEXT: s_cbranch_scc0 .LBB18_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s5 
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s4 ; GFX1064_DPP-NEXT: ds_or_rtn_b64 v[9:10], v8, v[9:10] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv -; GFX1064_DPP-NEXT: .LBB18_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_DPP-NEXT: .LBB18_2: ; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v5 ; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v10 @@ -9318,13 +9677,14 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP: ; %bb.0: ; %entry ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1032_DPP-NEXT: s_mov_b32 s4, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v7 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v8 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 @@ -9341,32 +9701,35 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 -; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s5, -1 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 
31 +; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v6, s6, 16 -; GFX1032_DPP-NEXT: v_writelane_b32 v5, s5, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_readlane_b32 s7, v2, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s5 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s5, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v6, s7, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v5, s6, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s5 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032_DPP-NEXT: s_cbranch_execz .LBB18_2 +; GFX1032_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_scc0 .LBB18_2 ; GFX1032_DPP-NEXT: ; %bb.1: ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0 ; GFX1032_DPP-NEXT: ds_or_rtn_b64 v[9:10], v8, v[9:10] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv -; GFX1032_DPP-NEXT: .LBB18_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_DPP-NEXT: .LBB18_2: ; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v5 ; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v10 @@ -9383,6 +9746,7 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec ; 
GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v7 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v8 @@ -9390,7 +9754,7 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -9415,53 +9779,56 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v2, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s6 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s7 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; 
GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47 -; GFX1164_DPP-NEXT: v_writelane_b32 v6, s4, 16 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s5, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 63 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47 -; GFX1164_DPP-NEXT: v_writelane_b32 v6, s6, 32 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 32 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 15 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX1164_DPP-NEXT: v_writelane_b32 v6, s9, 48 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v2, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v1, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v6, s6, 16 +; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s10, v1, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s11, v2, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v2, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v6, s8, 32 +; GFX1164_DPP-NEXT: 
v_writelane_b32 v5, s9, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v6, s11, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v5, s10, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164_DPP-NEXT: s_cbranch_execz .LBB18_2 +; GFX1164_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164_DPP-NEXT: s_cbranch_scc0 .LBB18_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s4 ; GFX1164_DPP-NEXT: ds_or_rtn_b64 v[9:10], v8, v[9:10] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164_DPP-NEXT: .LBB18_2: -; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v5 ; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v10 @@ -9480,6 +9847,7 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: or_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v7, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v7 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v8 @@ -9487,7 +9855,7 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: 
s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -9508,34 +9876,36 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s5, -1 +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v6, s6, 16 -; GFX1132_DPP-NEXT: v_writelane_b32 v5, s5, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: v_readlane_b32 s7, v2, 15 +; GFX1132_DPP-NEXT: 
s_mov_b32 exec_lo, s5 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s5, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_writelane_b32 v6, s7, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v5, s6, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s5 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132_DPP-NEXT: s_cbranch_execz .LBB18_2 +; GFX1132_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_scc0 .LBB18_2 ; GFX1132_DPP-NEXT: ; %bb.1: ; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0 ; GFX1132_DPP-NEXT: ds_or_rtn_b64 v[9:10], v8, v[9:10] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv -; GFX1132_DPP-NEXT: .LBB18_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_DPP-NEXT: .LBB18_2: ; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v5 ; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v10 @@ -9580,18 +9950,19 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4 +; GFX7LESS_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_scc0 .LBB19_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; 
GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_ITERATIVE-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS_ITERATIVE-NEXT: .LBB19_4: ; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: .LBB19_4: ; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -9621,18 +9992,19 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4 +; GFX8_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX8_ITERATIVE-NEXT: s_cbranch_scc0 .LBB19_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_ITERATIVE-NEXT: .LBB19_4: ; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_ITERATIVE-NEXT: .LBB19_4: ; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 @@ -9662,17 +10034,18 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: 
$vgpr0 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4 +; GFX9_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX9_ITERATIVE-NEXT: s_cbranch_scc0 .LBB19_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX9_ITERATIVE-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_ITERATIVE-NEXT: .LBB19_4: ; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_ITERATIVE-NEXT: .LBB19_4: ; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 @@ -9702,18 +10075,19 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4 +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc0 .LBB19_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064_ITERATIVE-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1064_ITERATIVE-NEXT: .LBB19_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: .LBB19_4: ; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 @@ -9743,18 +10117,19 @@ 
define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4 +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc0 .LBB19_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 ; GFX1032_ITERATIVE-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1032_ITERATIVE-NEXT: .LBB19_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: .LBB19_4: ; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 @@ -9789,18 +10164,18 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4 +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc0 .LBB19_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 ; 
GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_ITERATIVE-NEXT: ds_xor_rtn_b32 v1, v1, v2 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1164_ITERATIVE-NEXT: .LBB19_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: .LBB19_4: ; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 @@ -9833,19 +10208,20 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4 +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc0 .LBB19_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 ; GFX1132_ITERATIVE-NEXT: ds_xor_rtn_b32 v1, v1, v2 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1132_ITERATIVE-NEXT: .LBB19_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: .LBB19_4: ; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 @@ -9873,6 +10249,7 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; 
; GFX8_DPP-LABEL: xor_i32_varying: ; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 @@ -9880,7 +10257,7 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX8_DPP-NEXT: s_nop 0 @@ -9893,21 +10270,22 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s6, v1, 63 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8_DPP-NEXT: s_cbranch_execz .LBB19_2 +; GFX8_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX8_DPP-NEXT: s_cbranch_scc0 .LBB19_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s6 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 ; GFX8_DPP-NEXT: ds_xor_rtn_b32 v0, v3, v0 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_DPP-NEXT: .LBB19_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: .LBB19_2: ; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 
v2 @@ -9920,6 +10298,7 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; ; GFX9_DPP-LABEL: xor_i32_varying: ; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 @@ -9927,7 +10306,7 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9_DPP-NEXT: s_nop 0 @@ -9940,20 +10319,21 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s6, v1, 63 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_DPP-NEXT: s_cbranch_execz .LBB19_2 +; GFX9_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9_DPP-NEXT: s_cbranch_scc0 .LBB19_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s6 ; GFX9_DPP-NEXT: ds_xor_rtn_b32 v0, v3, v0 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_DPP-NEXT: .LBB19_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: .LBB19_2: ; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9_DPP-NEXT: 
v_readfirstlane_b32 s4, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v2 @@ -9966,11 +10346,12 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: xor_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -9979,38 +10360,42 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s6 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: 
s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 16 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 63 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064_DPP-NEXT: s_cbranch_execz .LBB19_2 +; GFX1064_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064_DPP-NEXT: s_cbranch_scc0 .LBB19_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s7 +; GFX1064_DPP-NEXT: s_mov_b32 s4, s7 ; GFX1064_DPP-NEXT: ds_xor_rtn_b32 v0, v4, v0 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv -; GFX1064_DPP-NEXT: .LBB19_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_DPP-NEXT: .LBB19_2: ; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -10022,11 +10407,12 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: xor_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, 
exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s1, -1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -10035,28 +10421,31 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s1, -1 +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s1, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032_DPP-NEXT: s_cbranch_execz .LBB19_2 +; GFX1032_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_scc0 .LBB19_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; 
GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s5 ; GFX1032_DPP-NEXT: ds_xor_rtn_b32 v0, v4, v0 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv -; GFX1032_DPP-NEXT: .LBB19_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032_DPP-NEXT: .LBB19_2: ; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -10069,12 +10458,13 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-LABEL: xor_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -10087,40 +10477,43 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s6 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 16 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 63 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1164_DPP-NEXT: s_and_saveexec_b64 
s[0:1], vcc -; GFX1164_DPP-NEXT: s_cbranch_execz .LBB19_2 +; GFX1164_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164_DPP-NEXT: s_cbranch_scc0 .LBB19_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s7 +; GFX1164_DPP-NEXT: s_mov_b32 s4, s7 ; GFX1164_DPP-NEXT: ds_xor_rtn_b32 v0, v4, v0 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv -; GFX1164_DPP-NEXT: .LBB19_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_DPP-NEXT: .LBB19_2: ; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -10136,12 +10529,13 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: xor_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s1, -1 ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -10153,28 +10547,31 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 -; 
GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s1, -1 +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s1, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132_DPP-NEXT: s_cbranch_execz .LBB19_2 +; GFX1132_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_scc0 .LBB19_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s5 ; GFX1132_DPP-NEXT: ds_xor_rtn_b32 v0, v4, v0 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1132_DPP-NEXT: .LBB19_2: -; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -10218,10 +10615,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS_ITERATIVE-NEXT: 
v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 +; GFX7LESS_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_scc0 .LBB20_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -10229,8 +10627,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4] ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS_ITERATIVE-NEXT: .LBB20_4: ; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: .LBB20_4: ; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -10265,10 +10663,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 +; GFX8_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX8_ITERATIVE-NEXT: s_cbranch_scc0 .LBB20_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -10276,8 +10675,8 @@ 
define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4] ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_ITERATIVE-NEXT: .LBB20_4: ; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_ITERATIVE-NEXT: .LBB20_4: ; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 @@ -10312,18 +10711,19 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 +; GFX9_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX9_ITERATIVE-NEXT: s_cbranch_scc0 .LBB20_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX9_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_ITERATIVE-NEXT: .LBB20_4: ; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_ITERATIVE-NEXT: .LBB20_4: ; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 @@ -10358,9 +10758,10 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 
s[4:5], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc0 .LBB20_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -10368,9 +10769,9 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4] ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1064_ITERATIVE-NEXT: .LBB20_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: .LBB20_4: ; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4 @@ -10405,9 +10806,10 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 -; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, vcc_lo, exec_lo +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc0 .LBB20_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -10415,9 +10817,9 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4] ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX1032_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1032_ITERATIVE-NEXT: .LBB20_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: .LBB20_4: ; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4 @@ -10456,10 +10858,10 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc0 .LBB20_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 @@ -10467,8 +10869,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: ds_xor_rtn_b64 v[2:3], v4, v[2:3] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1164_ITERATIVE-NEXT: .LBB20_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: .LBB20_4: ; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v2 @@ -10504,20 +10906,21 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; 
GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 -; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, vcc_lo, exec_lo +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc0 .LBB20_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 ; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 ; GFX1132_ITERATIVE-NEXT: ds_xor_rtn_b64 v[2:3], v4, v[2:3] ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1132_ITERATIVE-NEXT: .LBB20_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: .LBB20_4: ; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v2 @@ -10548,20 +10951,21 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-LABEL: xor_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX8_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v5 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v6 ; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: 
v_mov_b32_e32 v3, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 0 @@ -10579,23 +10983,24 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s7, v4, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s6, v3, 63 ; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8_DPP-NEXT: s_cbranch_execz .LBB20_2 +; GFX8_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX8_DPP-NEXT: s_cbranch_scc0 .LBB20_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s6 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 ; GFX8_DPP-NEXT: ds_xor_rtn_b64 v[7:8], v6, v[7:8] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_DPP-NEXT: .LBB20_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: .LBB20_2: ; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8 @@ -10612,20 +11017,21 @@ define amdgpu_kernel void 
@xor_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-LABEL: xor_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX9_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v5 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v6 ; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 0 @@ -10643,22 +11049,23 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s7, v4, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s6, v3, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_DPP-NEXT: 
s_cbranch_execz .LBB20_2 +; GFX9_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9_DPP-NEXT: s_cbranch_scc0 .LBB20_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s6 ; GFX9_DPP-NEXT: ds_xor_rtn_b64 v[7:8], v6, v[7:8] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_DPP-NEXT: .LBB20_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: .LBB20_2: ; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8 @@ -10676,13 +11083,14 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP: ; %bb.0: ; %entry ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v7 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v8 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 @@ -10699,50 +11107,53 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s4 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s5 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; 
GFX1064_DPP-NEXT: v_readlane_b32 s7, v2, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s6 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s7 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 -; GFX1064_DPP-NEXT: v_writelane_b32 v6, s4, 16 -; GFX1064_DPP-NEXT: v_writelane_b32 v5, s5, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 63 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1064_DPP-NEXT: v_readlane_b32 s9, v2, 47 -; GFX1064_DPP-NEXT: v_writelane_b32 v6, s6, 32 -; GFX1064_DPP-NEXT: v_writelane_b32 v5, s7, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 15 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX1064_DPP-NEXT: v_writelane_b32 v6, s9, 48 -; GFX1064_DPP-NEXT: v_writelane_b32 v5, s8, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v2, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s9, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v6, s6, 16 +; GFX1064_DPP-NEXT: v_writelane_b32 v5, s7, 16 +; 
GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s10, v1, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s11, v2, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v2, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v6, s8, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v5, s9, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v6, s11, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v5, s10, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1064_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064_DPP-NEXT: s_cbranch_execz .LBB20_2 +; GFX1064_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064_DPP-NEXT: s_cbranch_scc0 .LBB20_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s4 ; GFX1064_DPP-NEXT: ds_xor_rtn_b64 v[9:10], v8, v[9:10] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv -; GFX1064_DPP-NEXT: .LBB20_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_DPP-NEXT: .LBB20_2: ; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v5 ; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v10 @@ -10759,13 +11170,14 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP: ; %bb.0: ; %entry ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1032_DPP-NEXT: s_mov_b32 s4, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v7 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v8 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; 
GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 @@ -10782,32 +11194,35 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 -; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s5, -1 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v6, s6, 16 -; GFX1032_DPP-NEXT: v_writelane_b32 v5, s5, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_readlane_b32 s7, v2, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s5 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s5, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v6, s7, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v5, s6, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s5 ; GFX1032_DPP-NEXT: s_mov_b32 
s6, -1 +; GFX1032_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032_DPP-NEXT: s_cbranch_execz .LBB20_2 +; GFX1032_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_scc0 .LBB20_2 ; GFX1032_DPP-NEXT: ; %bb.1: ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0 ; GFX1032_DPP-NEXT: ds_xor_rtn_b64 v[9:10], v8, v[9:10] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv -; GFX1032_DPP-NEXT: .LBB20_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_DPP-NEXT: .LBB20_2: ; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v5 ; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v10 @@ -10824,6 +11239,7 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v7 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v8 @@ -10831,7 +11247,7 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -10856,53 +11272,56 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 
quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v2, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s6 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s7 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47 -; GFX1164_DPP-NEXT: v_writelane_b32 v6, s4, 16 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s5, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 63 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1164_DPP-NEXT: 
v_readlane_b32 s9, v2, 47 -; GFX1164_DPP-NEXT: v_writelane_b32 v6, s6, 32 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 32 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 15 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX1164_DPP-NEXT: v_writelane_b32 v6, s9, 48 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v2, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v1, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v6, s6, 16 +; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s10, v1, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s11, v2, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v2, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v6, s8, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v5, s9, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v6, s11, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v5, s10, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164_DPP-NEXT: s_cbranch_execz .LBB20_2 +; GFX1164_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164_DPP-NEXT: s_cbranch_scc0 .LBB20_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX1164_DPP-NEXT: 
v_mov_b32_e32 v9, s4 ; GFX1164_DPP-NEXT: ds_xor_rtn_b64 v[9:10], v8, v[9:10] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164_DPP-NEXT: .LBB20_2: -; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v5 ; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v10 @@ -10921,6 +11340,7 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: xor_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v7, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v7 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v8 @@ -10928,7 +11348,7 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -10949,34 +11369,36 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; 
GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s5, -1 +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v6, s6, 16 -; GFX1132_DPP-NEXT: v_writelane_b32 v5, s5, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: v_readlane_b32 s7, v2, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s5 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s5, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_writelane_b32 v6, s7, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v5, s6, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s5 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132_DPP-NEXT: s_cbranch_execz .LBB20_2 +; GFX1132_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_scc0 .LBB20_2 ; GFX1132_DPP-NEXT: ; %bb.1: ; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0 ; GFX1132_DPP-NEXT: ds_xor_rtn_b64 v[9:10], v8, v[9:10] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX1132_DPP-NEXT: buffer_gl0_inv -; GFX1132_DPP-NEXT: .LBB20_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_DPP-NEXT: .LBB20_2: ; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v5 ; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v10 @@ -11021,18 +11443,19 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4 +; GFX7LESS_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_scc0 .LBB21_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_ITERATIVE-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS_ITERATIVE-NEXT: .LBB21_4: ; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: .LBB21_4: ; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -11062,18 +11485,19 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX8_ITERATIVE-NEXT: 
s_and_saveexec_b64 s[0:1], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4 +; GFX8_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX8_ITERATIVE-NEXT: s_cbranch_scc0 .LBB21_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_ITERATIVE-NEXT: .LBB21_4: ; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_ITERATIVE-NEXT: .LBB21_4: ; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 @@ -11103,17 +11527,18 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4 +; GFX9_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX9_ITERATIVE-NEXT: s_cbranch_scc0 .LBB21_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX9_ITERATIVE-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_ITERATIVE-NEXT: .LBB21_4: ; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_ITERATIVE-NEXT: .LBB21_4: ; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 @@ -11143,18 +11568,19 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; 
GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4 +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc0 .LBB21_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064_ITERATIVE-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1064_ITERATIVE-NEXT: .LBB21_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: .LBB21_4: ; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 @@ -11184,18 +11610,19 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4 +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc0 .LBB21_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 ; GFX1032_ITERATIVE-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX1032_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1032_ITERATIVE-NEXT: .LBB21_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: .LBB21_4: ; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 @@ -11230,18 +11657,18 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4 +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc0 .LBB21_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_ITERATIVE-NEXT: ds_max_rtn_i32 v1, v1, v2 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1164_ITERATIVE-NEXT: .LBB21_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: .LBB21_4: ; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 @@ -11274,19 +11701,20 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; 
GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4 +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc0 .LBB21_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 ; GFX1132_ITERATIVE-NEXT: ds_max_rtn_i32 v1, v1, v2 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1132_ITERATIVE-NEXT: .LBB21_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: .LBB21_4: ; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 @@ -11314,16 +11742,17 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; ; GFX8_DPP-LABEL: max_i32_varying: ; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 
row_mask:0xf bank_mask:0xf @@ -11335,22 +11764,23 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s6, v2, 63 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8_DPP-NEXT: s_cbranch_execz .LBB21_2 +; GFX8_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX8_DPP-NEXT: s_cbranch_scc0 .LBB21_2 ; GFX8_DPP-NEXT: ; %bb.1: ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s6 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 ; GFX8_DPP-NEXT: ds_max_rtn_i32 v0, v0, v3 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_DPP-NEXT: .LBB21_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: .LBB21_2: ; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v1 @@ -11363,16 +11793,17 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; ; GFX9_DPP-LABEL: max_i32_varying: ; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9_DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: v_bfrev_b32_e32 
v2, 1 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -11384,21 +11815,22 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s6, v2, 63 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_DPP-NEXT: s_cbranch_execz .LBB21_2 +; GFX9_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9_DPP-NEXT: s_cbranch_scc0 .LBB21_2 ; GFX9_DPP-NEXT: ; %bb.1: ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s6 ; GFX9_DPP-NEXT: ds_max_rtn_i32 v0, v0, v3 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_DPP-NEXT: .LBB21_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: .LBB21_2: ; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1 @@ -11411,11 +11843,12 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: max_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1064_DPP-NEXT: s_not_b64 exec, 
exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf @@ -11424,38 +11857,42 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s6 ; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 16 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; 
GFX1064_DPP-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 63 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064_DPP-NEXT: s_cbranch_execz .LBB21_2 +; GFX1064_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064_DPP-NEXT: s_cbranch_scc0 .LBB21_2 ; GFX1064_DPP-NEXT: ; %bb.1: ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s7 +; GFX1064_DPP-NEXT: s_mov_b32 s4, s7 ; GFX1064_DPP-NEXT: ds_max_rtn_i32 v0, v0, v4 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv -; GFX1064_DPP-NEXT: .LBB21_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_DPP-NEXT: .LBB21_2: ; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -11467,11 +11904,12 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: max_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s1, -1 ; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf @@ -11480,28 +11918,31 @@ define amdgpu_kernel void 
@max_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s1, -1 +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s1, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032_DPP-NEXT: s_cbranch_execz .LBB21_2 +; GFX1032_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_scc0 .LBB21_2 ; GFX1032_DPP-NEXT: ; %bb.1: ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s5 ; GFX1032_DPP-NEXT: ds_max_rtn_i32 v0, v0, v4 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv -; GFX1032_DPP-NEXT: .LBB21_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032_DPP-NEXT: .LBB21_2: ; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; 
GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -11514,12 +11955,13 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-LABEL: max_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -11532,40 +11974,43 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s6 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164_DPP-NEXT: 
s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 16 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 63 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164_DPP-NEXT: s_cbranch_execz .LBB21_2 +; GFX1164_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164_DPP-NEXT: s_cbranch_scc0 .LBB21_2 ; GFX1164_DPP-NEXT: ; %bb.1: ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s7 +; GFX1164_DPP-NEXT: s_mov_b32 s4, s7 ; GFX1164_DPP-NEXT: ds_max_rtn_i32 v0, v0, v4 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv -; 
GFX1164_DPP-NEXT: .LBB21_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_DPP-NEXT: .LBB21_2: ; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -11581,12 +12026,13 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: max_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s1, -1 ; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -11598,29 +12044,32 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 
v3, s1, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s1, -1 +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132_DPP-NEXT: s_cbranch_execz .LBB21_2 +; GFX1132_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_scc0 .LBB21_2 ; GFX1132_DPP-NEXT: ; %bb.1: ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s5 ; GFX1132_DPP-NEXT: ds_max_rtn_i32 v0, v0, v4 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1132_DPP-NEXT: .LBB21_2: -; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -11642,12 +12091,14 @@ entry: define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-LABEL: max_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_b64 s[4:5], vcc, -1 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: 
s_cbranch_execz .LBB22_2 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB22_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -11655,8 +12106,8 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: .LBB22_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: .LBB22_2: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 @@ -11679,9 +12130,11 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_cbranch_execz .LBB22_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB22_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 @@ -11689,8 +12142,8 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB22_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: .LBB22_2: ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_bfrev_b32_e32 v0, 1 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 @@ -11713,17 +12166,19 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; 
GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB22_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB22_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB22_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: .LBB22_2: ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_bfrev_b32_e32 v0, 1 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 @@ -11744,11 +12199,13 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1064-LABEL: max_i64_constant: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB22_2 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB22_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 @@ -11756,9 +12213,9 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB22_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: .LBB22_2: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 @@ -11777,10 +12234,12 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1032-LABEL: max_i64_constant: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; 
GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB22_2 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB22_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 @@ -11788,9 +12247,9 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB22_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: .LBB22_2: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 @@ -11809,12 +12268,14 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1164-LABEL: max_i64_constant: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB22_2 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB22_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: v_mov_b32_e32 v0, 5 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 @@ -11822,8 +12283,8 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB22_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: .LBB22_2: ; GFX1164-NEXT: s_load_b64 
s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 @@ -11844,19 +12305,21 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1132-LABEL: max_i64_constant: ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB22_2 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB22_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v0, 5 ; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX1132-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB22_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: .LBB22_2: ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 @@ -11910,10 +12373,11 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 +; GFX7LESS_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_scc0 .LBB23_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -11921,8 +12385,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4] ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS_ITERATIVE-NEXT: .LBB23_4: ; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: .LBB23_4: ; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -11966,10 +12430,11 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 +; GFX8_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX8_ITERATIVE-NEXT: s_cbranch_scc0 .LBB23_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -11977,8 +12442,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4] ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_ITERATIVE-NEXT: .LBB23_4: ; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_ITERATIVE-NEXT: .LBB23_4: ; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 @@ -12022,18 +12487,19 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 +; GFX9_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX9_ITERATIVE-NEXT: s_cbranch_scc0 .LBB23_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX9_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_ITERATIVE-NEXT: .LBB23_4: ; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_ITERATIVE-NEXT: .LBB23_4: ; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 @@ -12075,9 +12541,10 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc0 .LBB23_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -12085,9 +12552,9 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4] ; 
GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1064_ITERATIVE-NEXT: .LBB23_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: .LBB23_4: ; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 @@ -12127,9 +12594,10 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 -; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, vcc_lo, exec_lo +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc0 .LBB23_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -12137,9 +12605,9 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4] ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1032_ITERATIVE-NEXT: .LBB23_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: .LBB23_4: ; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 @@ -12186,10 +12654,10 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; 
GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc0 .LBB23_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 @@ -12197,8 +12665,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: ds_max_rtn_i64 v[2:3], v4, v[2:3] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1164_ITERATIVE-NEXT: .LBB23_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: .LBB23_4: ; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 @@ -12242,20 +12710,21 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 -; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, vcc_lo, exec_lo +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; 
GFX1132_ITERATIVE-NEXT: s_cbranch_scc0 .LBB23_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 ; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 ; GFX1132_ITERATIVE-NEXT: ds_max_rtn_i64 v[2:3], v4, v[2:3] ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1132_ITERATIVE-NEXT: .LBB23_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: .LBB23_4: ; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 @@ -12287,20 +12756,21 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-LABEL: max_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry ; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 -; GFX8_DPP-NEXT: s_mov_b32 s0, 0 +; GFX8_DPP-NEXT: s_mov_b32 s4, 0 +; GFX8_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX8_DPP-NEXT: s_brev_b32 s1, 1 +; GFX8_DPP-NEXT: s_brev_b32 s5, 1 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v7 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v8 ; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, s5 ; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, s1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s0 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, s5 ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX8_DPP-NEXT: s_nop 0 @@ -12349,24 +12819,24 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6] ; GFX8_DPP-NEXT: 
v_cndmask_b32_e32 v2, v6, v2, vcc ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX8_DPP-NEXT: v_readlane_b32 s1, v2, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s0, v1, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s5, v2, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63 ; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] -; GFX8_DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8_DPP-NEXT: s_cbranch_execz .LBB23_2 +; GFX8_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX8_DPP-NEXT: s_cbranch_scc0 .LBB23_2 ; GFX8_DPP-NEXT: ; %bb.1: ; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5 ; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 ; GFX8_DPP-NEXT: ds_max_rtn_i64 v[9:10], v8, v[9:10] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_DPP-NEXT: .LBB23_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: .LBB23_2: ; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v10 @@ -12386,20 +12856,21 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-LABEL: max_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 -; GFX9_DPP-NEXT: s_mov_b32 s0, 0 +; GFX9_DPP-NEXT: s_mov_b32 s4, 0 +; GFX9_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX9_DPP-NEXT: s_brev_b32 s1, 1 +; GFX9_DPP-NEXT: s_brev_b32 s5, 1 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v7 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v8 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX9_DPP-NEXT: 
v_mov_b32_e32 v2, s1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, s5 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, s1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s0 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, s5 ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX9_DPP-NEXT: s_nop 0 @@ -12448,23 +12919,23 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6] ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9_DPP-NEXT: v_readlane_b32 s1, v2, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s0, v1, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s5, v2, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] -; GFX9_DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_DPP-NEXT: s_cbranch_execz .LBB23_2 +; GFX9_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9_DPP-NEXT: s_cbranch_scc0 .LBB23_2 ; GFX9_DPP-NEXT: ; %bb.1: ; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5 ; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4 ; GFX9_DPP-NEXT: ds_max_rtn_i64 v[9:10], v8, v[9:10] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_DPP-NEXT: .LBB23_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: .LBB23_2: ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v10 @@ -12484,20 +12955,21 @@ define amdgpu_kernel void @max_i64_varying(ptr 
addrspace(1) %out) { ; GFX1064_DPP-LABEL: max_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064_DPP-NEXT: s_mov_b32 s0, 0 -; GFX1064_DPP-NEXT: s_brev_b32 s1, 1 +; GFX1064_DPP-NEXT: s_mov_b32 s4, 0 +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_DPP-NEXT: s_brev_b32 s5, 1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s5 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s5 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -12539,53 +13011,56 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8] ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s4 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s6 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s7 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; 
GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v2, s4, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v1, s5, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX1064_DPP-NEXT: v_readlane_b32 s8, v4, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s9, v3, 47 -; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32 -; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 48 -; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v3, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s9, v4, 47 +; GFX1064_DPP-NEXT: 
v_readlane_b32 s7, v4, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s10, v3, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s6, 32 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v3, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s9, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s10, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1064_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064_DPP-NEXT: s_cbranch_execz .LBB23_2 +; GFX1064_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064_DPP-NEXT: s_cbranch_scc0 .LBB23_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s5 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s4 ; GFX1064_DPP-NEXT: ds_max_rtn_i64 v[11:12], v10, v[11:12] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv -; GFX1064_DPP-NEXT: .LBB23_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_DPP-NEXT: .LBB23_2: ; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1 ; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12 @@ -12603,19 +13078,20 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP: ; %bb.0: ; %entry ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1032_DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032_DPP-NEXT: s_mov_b32 s4, exec_lo ; GFX1032_DPP-NEXT: s_brev_b32 s1, 1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s5, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 
s1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s5 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, s0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -12655,32 +13131,35 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8] ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s5, -1 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v4, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 -; GFX1032_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 15 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v2, s5, 16 -; GFX1032_DPP-NEXT: v_writelane_b32 v1, s6, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s5 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s5, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v2, s6, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v1, s7, 16 +; GFX1032_DPP-NEXT: 
s_mov_b32 exec_lo, s5 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032_DPP-NEXT: s_cbranch_execz .LBB23_2 +; GFX1032_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_scc0 .LBB23_2 ; GFX1032_DPP-NEXT: ; %bb.1: ; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0 ; GFX1032_DPP-NEXT: ds_max_rtn_i64 v[11:12], v10, v[11:12] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv -; GFX1032_DPP-NEXT: .LBB23_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_DPP-NEXT: .LBB23_2: ; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1 ; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12 @@ -12697,24 +13176,25 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-LABEL: max_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1164_DPP-NEXT: s_mov_b32 s0, 0 -; GFX1164_DPP-NEXT: s_brev_b32 s1, 1 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: s_mov_b32 s4, 0 +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_DPP-NEXT: s_brev_b32 s5, 1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s5 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: 
s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v10 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, s0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, s4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s5 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -12765,57 +13245,60 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s6 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s7 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; 
GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 31 -; GFX1164_DPP-NEXT: v_writelane_b32 v2, s4, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v1, s5, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX1164_DPP-NEXT: v_readlane_b32 s8, v4, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s9, v3, 47 -; GFX1164_DPP-NEXT: v_writelane_b32 v2, s6, 32 -; GFX1164_DPP-NEXT: v_writelane_b32 v1, s7, 32 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX1164_DPP-NEXT: v_writelane_b32 v2, s8, 48 -; GFX1164_DPP-NEXT: v_writelane_b32 v1, s9, 48 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s6, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v3, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s7, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v4, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v4, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v4, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s10, v3, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s6, 32 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v3, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s8, 32 +; 
GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s9, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s10, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164_DPP-NEXT: s_cbranch_execz .LBB23_2 +; GFX1164_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164_DPP-NEXT: s_cbranch_scc0 .LBB23_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s5 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s4 ; GFX1164_DPP-NEXT: ds_max_rtn_i64 v[11:12], v10, v[11:12] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164_DPP-NEXT: .LBB23_2: -; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1 ; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v12 @@ -12836,22 +13319,23 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1132_DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo ; GFX1132_DPP-NEXT: s_brev_b32 s1, 1 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s5, -1 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s5 ; GFX1132_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s5, -1 ; GFX1132_DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s5 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v10 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, s0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, s1 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v7, v1 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -12890,32 +13374,34 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8] ; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s5, -1 +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v4, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132_DPP-NEXT: 
s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v2, s5, 16 -; GFX1132_DPP-NEXT: v_writelane_b32 v1, s6, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s5 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s5, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v2, s6, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v1, s7, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s5 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132_DPP-NEXT: s_cbranch_execz .LBB23_2 +; GFX1132_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_scc0 .LBB23_2 ; GFX1132_DPP-NEXT: ; %bb.1: ; GFX1132_DPP-NEXT: v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0 ; GFX1132_DPP-NEXT: ds_max_rtn_i64 v[11:12], v10, v[11:12] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv -; GFX1132_DPP-NEXT: .LBB23_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_DPP-NEXT: .LBB23_2: ; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v1 ; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v12 @@ -12961,18 +13447,19 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4 +; GFX7LESS_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; 
GFX7LESS_ITERATIVE-NEXT: s_cbranch_scc0 .LBB24_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_ITERATIVE-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS_ITERATIVE-NEXT: .LBB24_4: ; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: .LBB24_4: ; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -13002,18 +13489,19 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4 +; GFX8_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX8_ITERATIVE-NEXT: s_cbranch_scc0 .LBB24_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_ITERATIVE-NEXT: .LBB24_4: ; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_ITERATIVE-NEXT: .LBB24_4: ; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 @@ -13043,17 +13531,18 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; 
GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4 +; GFX9_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX9_ITERATIVE-NEXT: s_cbranch_scc0 .LBB24_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX9_ITERATIVE-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_ITERATIVE-NEXT: .LBB24_4: ; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_ITERATIVE-NEXT: .LBB24_4: ; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 @@ -13083,18 +13572,19 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4 +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc0 .LBB24_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064_ITERATIVE-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1064_ITERATIVE-NEXT: .LBB24_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: .LBB24_4: 
; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 @@ -13124,18 +13614,19 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4 +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc0 .LBB24_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 ; GFX1032_ITERATIVE-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1032_ITERATIVE-NEXT: .LBB24_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: .LBB24_4: ; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 @@ -13170,18 +13661,18 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4 +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; 
GFX1164_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc0 .LBB24_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_ITERATIVE-NEXT: ds_min_rtn_i32 v1, v1, v2 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1164_ITERATIVE-NEXT: .LBB24_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: .LBB24_4: ; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 @@ -13214,19 +13705,20 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4 +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc0 .LBB24_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 ; GFX1132_ITERATIVE-NEXT: ds_min_rtn_i32 v1, v1, v2 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1132_ITERATIVE-NEXT: .LBB24_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: .LBB24_4: ; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; 
GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 @@ -13254,16 +13746,17 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; ; GFX8_DPP-LABEL: min_i32_varying: ; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, -2 ; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -13275,22 +13768,23 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s6, v2, 63 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8_DPP-NEXT: s_cbranch_execz .LBB24_2 +; GFX8_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX8_DPP-NEXT: s_cbranch_scc0 .LBB24_2 ; GFX8_DPP-NEXT: ; %bb.1: ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 
v3, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s6 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 ; GFX8_DPP-NEXT: ds_min_rtn_i32 v0, v0, v3 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_DPP-NEXT: .LBB24_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: .LBB24_2: ; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v1 @@ -13303,16 +13797,17 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; ; GFX9_DPP-LABEL: min_i32_varying: ; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9_DPP-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, -2 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -13324,21 +13819,22 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s6, v2, 63 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; 
GFX9_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_DPP-NEXT: s_cbranch_execz .LBB24_2 +; GFX9_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9_DPP-NEXT: s_cbranch_scc0 .LBB24_2 ; GFX9_DPP-NEXT: ; %bb.1: ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s6 ; GFX9_DPP-NEXT: ds_min_rtn_i32 v0, v0, v3 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_DPP-NEXT: .LBB24_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: .LBB24_2: ; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1 @@ -13351,11 +13847,12 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: min_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v1, -2 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v3, -2 ; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf @@ -13364,38 +13861,42 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s6 ; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 
row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 16 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 63 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064_DPP-NEXT: s_cbranch_execz .LBB24_2 +; GFX1064_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064_DPP-NEXT: s_cbranch_scc0 .LBB24_2 ; GFX1064_DPP-NEXT: ; %bb.1: ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s7 +; GFX1064_DPP-NEXT: s_mov_b32 s4, s7 ; GFX1064_DPP-NEXT: ds_min_rtn_i32 v0, v0, v4 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv -; 
GFX1064_DPP-NEXT: .LBB24_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_DPP-NEXT: .LBB24_2: ; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -13407,11 +13908,12 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: min_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v1, -2 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s1, -1 ; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v3, -2 ; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf @@ -13420,28 +13922,31 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s1, -1 +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1032_DPP-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s1, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032_DPP-NEXT: s_cbranch_execz .LBB24_2 +; GFX1032_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_scc0 .LBB24_2 ; GFX1032_DPP-NEXT: ; %bb.1: ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s5 ; GFX1032_DPP-NEXT: ds_min_rtn_i32 v0, v0, v4 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv -; GFX1032_DPP-NEXT: .LBB24_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032_DPP-NEXT: .LBB24_2: ; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -13454,12 +13959,13 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-LABEL: min_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v1, -2 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v3, -2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -13472,40 +13978,43 @@ define amdgpu_kernel void 
@min_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s6 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 16 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_or_saveexec_b64 
s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 63 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164_DPP-NEXT: s_cbranch_execz .LBB24_2 +; GFX1164_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164_DPP-NEXT: s_cbranch_scc0 .LBB24_2 ; GFX1164_DPP-NEXT: ; %bb.1: ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s7 +; GFX1164_DPP-NEXT: s_mov_b32 s4, s7 ; GFX1164_DPP-NEXT: ds_min_rtn_i32 v0, v0, v4 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv -; GFX1164_DPP-NEXT: .LBB24_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_DPP-NEXT: .LBB24_2: ; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -13521,12 +14030,13 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: min_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v1, -2 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s1, -1 ; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v3, -2 
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -13538,29 +14048,32 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s1, -1 +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132_DPP-NEXT: s_cbranch_execz .LBB24_2 +; GFX1132_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_scc0 .LBB24_2 ; GFX1132_DPP-NEXT: ; %bb.1: ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 
0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s5 ; GFX1132_DPP-NEXT: ds_min_rtn_i32 v0, v0, v4 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1132_DPP-NEXT: .LBB24_2: -; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -13582,12 +14095,14 @@ entry: define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-LABEL: min_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_b64 s[4:5], vcc, -1 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB25_2 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB25_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -13595,8 +14110,8 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: .LBB25_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: .LBB25_2: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 @@ -13619,9 +14134,11 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; 
GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_cbranch_execz .LBB25_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB25_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 @@ -13629,8 +14146,8 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB25_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: .LBB25_2: ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_bfrev_b32_e32 v0, -2 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 @@ -13653,17 +14170,19 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB25_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB25_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB25_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: .LBB25_2: ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_bfrev_b32_e32 v0, -2 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 @@ -13684,11 +14203,13 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1064-LABEL: min_i64_constant: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1064-NEXT: 
s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB25_2 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB25_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 @@ -13696,9 +14217,9 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB25_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: .LBB25_2: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 @@ -13717,10 +14238,12 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1032-LABEL: min_i64_constant: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB25_2 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB25_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 @@ -13728,9 +14251,9 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB25_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: .LBB25_2: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 @@ -13749,12 +14272,14 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; 
GFX1164-LABEL: min_i64_constant: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB25_2 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB25_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: v_mov_b32_e32 v0, 5 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 @@ -13762,8 +14287,8 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB25_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: .LBB25_2: ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 @@ -13784,19 +14309,21 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1132-LABEL: min_i64_constant: ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB25_2 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB25_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v0, 5 ; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX1132-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB25_2: ; 
GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: .LBB25_2: ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 @@ -13850,10 +14377,11 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 +; GFX7LESS_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_scc0 .LBB26_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -13861,8 +14389,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4] ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS_ITERATIVE-NEXT: .LBB26_4: ; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: .LBB26_4: ; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -13906,10 +14434,11 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; 
GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 +; GFX8_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX8_ITERATIVE-NEXT: s_cbranch_scc0 .LBB26_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -13917,8 +14446,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4] ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_ITERATIVE-NEXT: .LBB26_4: ; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_ITERATIVE-NEXT: .LBB26_4: ; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 @@ -13962,18 +14491,19 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 +; GFX9_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX9_ITERATIVE-NEXT: s_cbranch_scc0 .LBB26_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX9_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_ITERATIVE-NEXT: .LBB26_4: ; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_ITERATIVE-NEXT: .LBB26_4: ; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; 
GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 @@ -14015,9 +14545,10 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc0 .LBB26_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -14025,9 +14556,9 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4] ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1064_ITERATIVE-NEXT: .LBB26_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: .LBB26_4: ; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 @@ -14067,9 +14598,10 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 -; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, vcc_lo, exec_lo +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032_ITERATIVE-NEXT: 
s_cmov_b32 exec_lo, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc0 .LBB26_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -14077,9 +14609,9 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4] ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1032_ITERATIVE-NEXT: .LBB26_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: .LBB26_4: ; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 @@ -14126,10 +14658,10 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc0 .LBB26_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 @@ -14137,8 +14669,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: ds_min_rtn_i64 v[2:3], v4, v[2:3] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1164_ITERATIVE-NEXT: .LBB26_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: .LBB26_4: ; 
GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 @@ -14182,20 +14714,21 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 -; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, vcc_lo, exec_lo +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc0 .LBB26_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 ; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 ; GFX1132_ITERATIVE-NEXT: ds_min_rtn_i64 v[2:3], v4, v[2:3] ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1132_ITERATIVE-NEXT: .LBB26_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: .LBB26_4: ; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 @@ -14227,6 +14760,7 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-LABEL: min_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry ; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX8_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_DPP-NEXT: 
s_mov_b32 s6, -1 @@ -14238,7 +14772,7 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, s6 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, s7 ; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s6 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, s7 ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 @@ -14289,23 +14823,25 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6] ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX8_DPP-NEXT: v_readlane_b32 s5, v2, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s9, v2, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s8, v1, 63 ; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8_DPP-NEXT: s_cbranch_execz .LBB26_2 +; GFX8_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX8_DPP-NEXT: s_cbranch_scc0 .LBB26_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s9 +; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s8 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 ; GFX8_DPP-NEXT: ds_min_rtn_i64 v[9:10], v8, v[9:10] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_DPP-NEXT: .LBB26_2: +; GFX8_DPP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: .LBB26_2: ; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX8_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s1, v10 @@ -14324,6 
+14860,7 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-LABEL: min_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX9_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_DPP-NEXT: s_mov_b32 s6, -1 @@ -14335,7 +14872,7 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, s6 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, s7 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s6 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, s7 ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 @@ -14386,22 +14923,24 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6] ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9_DPP-NEXT: v_readlane_b32 s5, v2, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s9, v2, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s8, v1, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_DPP-NEXT: s_cbranch_execz .LBB26_2 +; GFX9_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9_DPP-NEXT: s_cbranch_scc0 .LBB26_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s9 +; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s8 ; GFX9_DPP-NEXT: ds_min_rtn_i64 v[9:10], v8, v[9:10] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 
-; GFX9_DPP-NEXT: .LBB26_2: +; GFX9_DPP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: .LBB26_2: ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s1, v10 @@ -14420,20 +14959,21 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-LABEL: min_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 ; GFX1064_DPP-NEXT: s_brev_b32 s7, -2 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, s6 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s7 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s6 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s7 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -14475,52 +15015,55 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8] ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s4 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v3, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s7 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 
s8 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v3, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v4, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s7, v4, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s8, v3, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v2, s4, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v1, s5, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX1064_DPP-NEXT: v_readlane_b32 s10, v4, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s11, v3, 47 -; GFX1064_DPP-NEXT: v_writelane_b32 v2, s7, 32 -; GFX1064_DPP-NEXT: v_writelane_b32 v1, s8, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX1064_DPP-NEXT: v_writelane_b32 v2, s10, 48 -; GFX1064_DPP-NEXT: v_writelane_b32 v1, s11, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[8:9] +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s10, v4, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s7, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; 
GFX1064_DPP-NEXT: v_writelane_b32 v1, s8, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s9, v4, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v3, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s11, v4, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s12, v3, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s10, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s11, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s12, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1064_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064_DPP-NEXT: s_cbranch_execz .LBB26_2 +; GFX1064_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064_DPP-NEXT: s_cbranch_scc0 .LBB26_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s9 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s8 +; GFX1064_DPP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX1064_DPP-NEXT: ds_min_rtn_i64 v[11:12], v10, v[11:12] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv -; GFX1064_DPP-NEXT: .LBB26_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_DPP-NEXT: .LBB26_2: ; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1 ; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12 @@ -14537,6 +15080,7 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-LABEL: min_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1032_DPP-NEXT: s_mov_b32 s4, exec_lo ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 ; GFX1032_DPP-NEXT: s_brev_b32 s7, -2 @@ -14550,7 
+15094,7 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, s6 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s7 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -14590,31 +15134,34 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8] ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s5, -1 ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s7, v4, 15 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s7, v3, 15 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v2, s5, 16 -; GFX1032_DPP-NEXT: v_writelane_b32 v1, s7, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_readlane_b32 s8, v3, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s5 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s5, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v2, s7, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v1, s8, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s5 +; GFX1032_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo -; 
GFX1032_DPP-NEXT: s_cbranch_execz .LBB26_2 +; GFX1032_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_scc0 .LBB26_2 ; GFX1032_DPP-NEXT: ; %bb.1: ; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0 ; GFX1032_DPP-NEXT: ds_min_rtn_i64 v[11:12], v10, v[11:12] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv -; GFX1032_DPP-NEXT: .LBB26_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_DPP-NEXT: .LBB26_2: ; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1 ; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12 @@ -14631,16 +15178,17 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-LABEL: min_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1164_DPP-NEXT: s_brev_b32 s7, -2 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, s6 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s7 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v10 @@ -14648,7 +15196,7 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, s6 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s7 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 
s[4:5], -1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -14699,57 +15247,59 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v4, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v3, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s7 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s8 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v3, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 15 +; GFX1164_DPP-NEXT: 
v_readlane_b32 s7, v4, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s7, v4, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s8, v3, 31 -; GFX1164_DPP-NEXT: v_writelane_b32 v2, s4, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v1, s5, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX1164_DPP-NEXT: v_readlane_b32 s10, v4, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s11, v3, 47 -; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 32 -; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 32 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX1164_DPP-NEXT: v_writelane_b32 v2, s10, 48 -; GFX1164_DPP-NEXT: v_writelane_b32 v1, s11, 48 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[8:9] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s10, v4, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v4, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v3, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s11, v4, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s12, v3, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s10, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s7, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s11, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s12, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: 
s_cmp_lg_u64 vcc, 0 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164_DPP-NEXT: s_cbranch_execz .LBB26_2 +; GFX1164_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164_DPP-NEXT: s_cbranch_scc0 .LBB26_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s9 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s8 +; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX1164_DPP-NEXT: ds_min_rtn_i64 v[11:12], v10, v[11:12] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164_DPP-NEXT: .LBB26_2: -; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1 ; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v12 @@ -14769,6 +15319,7 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: min_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1132_DPP-NEXT: s_brev_b32 s7, -2 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 @@ -14785,7 +15336,7 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, s6 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, s7 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v7, v1 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -14824,31 +15375,33 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; 
GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8] ; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s5, -1 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s7, v4, 15 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s7, v3, 15 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v2, s5, 16 -; GFX1132_DPP-NEXT: v_writelane_b32 v1, s7, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_DPP-NEXT: v_readlane_b32 s8, v3, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s5 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s5, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v2, s7, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v1, s8, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s5 +; GFX1132_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132_DPP-NEXT: s_cbranch_execz .LBB26_2 +; GFX1132_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_scc0 .LBB26_2 ; GFX1132_DPP-NEXT: ; %bb.1: ; GFX1132_DPP-NEXT: 
v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0 ; GFX1132_DPP-NEXT: ds_min_rtn_i64 v[11:12], v10, v[11:12] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv -; GFX1132_DPP-NEXT: .LBB26_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_DPP-NEXT: .LBB26_2: ; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v1 ; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v12 @@ -14894,18 +15447,19 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4 +; GFX7LESS_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_scc0 .LBB27_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_ITERATIVE-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS_ITERATIVE-NEXT: .LBB27_4: ; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: .LBB27_4: ; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -14935,18 +15489,19 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: 
s_xor_b64 s[0:1], vcc, exec +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4 +; GFX8_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX8_ITERATIVE-NEXT: s_cbranch_scc0 .LBB27_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_ITERATIVE-NEXT: .LBB27_4: ; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_ITERATIVE-NEXT: .LBB27_4: ; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 @@ -14976,17 +15531,18 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4 +; GFX9_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX9_ITERATIVE-NEXT: s_cbranch_scc0 .LBB27_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX9_ITERATIVE-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_ITERATIVE-NEXT: .LBB27_4: ; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_ITERATIVE-NEXT: .LBB27_4: ; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; 
GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 @@ -15016,18 +15572,19 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4 +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc0 .LBB27_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064_ITERATIVE-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1064_ITERATIVE-NEXT: .LBB27_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: .LBB27_4: ; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 @@ -15057,18 +15614,19 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4 +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc0 .LBB27_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 ; GFX1032_ITERATIVE-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1032_ITERATIVE-NEXT: .LBB27_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: .LBB27_4: ; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 @@ -15103,18 +15661,18 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4 +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc0 .LBB27_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_ITERATIVE-NEXT: ds_max_rtn_u32 v1, v1, v2 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1164_ITERATIVE-NEXT: .LBB27_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: .LBB27_4: ; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 @@ -15147,19 +15705,20 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: 
v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4 +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc0 .LBB27_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 ; GFX1132_ITERATIVE-NEXT: ds_max_rtn_u32 v1, v1, v2 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1132_ITERATIVE-NEXT: .LBB27_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: .LBB27_4: ; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 @@ -15187,6 +15746,7 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; ; GFX8_DPP-LABEL: umax_i32_varying: ; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 @@ -15194,7 +15754,7 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX8_DPP-NEXT: s_nop 0 @@ -15207,21 +15767,22 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s6, v1, 63 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8_DPP-NEXT: s_cbranch_execz .LBB27_2 +; GFX8_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX8_DPP-NEXT: s_cbranch_scc0 .LBB27_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s6 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 ; GFX8_DPP-NEXT: ds_max_rtn_u32 v0, v3, v0 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_DPP-NEXT: .LBB27_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: .LBB27_2: ; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v2 @@ -15234,6 +15795,7 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; ; GFX9_DPP-LABEL: umax_i32_varying: ; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 @@ -15241,7 +15803,7 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9_DPP-NEXT: 
v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9_DPP-NEXT: s_nop 0 @@ -15254,20 +15816,21 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s6, v1, 63 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_DPP-NEXT: s_cbranch_execz .LBB27_2 +; GFX9_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9_DPP-NEXT: s_cbranch_scc0 .LBB27_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s6 ; GFX9_DPP-NEXT: ds_max_rtn_u32 v0, v3, v0 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_DPP-NEXT: .LBB27_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: .LBB27_2: ; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v2 @@ -15280,11 +15843,12 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: umax_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: 
v_mov_b32_e32 v3, 0 ; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -15293,38 +15857,42 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s6 ; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 16 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 63 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064_DPP-NEXT: s_cbranch_execz .LBB27_2 +; GFX1064_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064_DPP-NEXT: s_cbranch_scc0 .LBB27_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s7 +; GFX1064_DPP-NEXT: s_mov_b32 s4, s7 ; GFX1064_DPP-NEXT: ds_max_rtn_u32 v0, v4, v0 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv -; GFX1064_DPP-NEXT: .LBB27_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_DPP-NEXT: .LBB27_2: ; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -15336,11 +15904,12 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: umax_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s1, -1 ; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -15349,28 +15918,31 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v2, v1 
quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s1, -1 +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s1, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032_DPP-NEXT: s_cbranch_execz .LBB27_2 +; GFX1032_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_scc0 .LBB27_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s5 ; GFX1032_DPP-NEXT: ds_max_rtn_u32 v0, v4, v0 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv -; GFX1032_DPP-NEXT: .LBB27_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032_DPP-NEXT: .LBB27_2: ; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -15383,12 +15955,13 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; 
GFX1164_DPP-LABEL: umax_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -15401,40 +15974,43 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s6 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; 
GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 16 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 63 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164_DPP-NEXT: s_cbranch_execz .LBB27_2 +; GFX1164_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164_DPP-NEXT: s_cbranch_scc0 .LBB27_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s7 +; GFX1164_DPP-NEXT: s_mov_b32 s4, s7 ; GFX1164_DPP-NEXT: ds_max_rtn_u32 v0, v4, v0 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv -; GFX1164_DPP-NEXT: .LBB27_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_DPP-NEXT: .LBB27_2: ; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; 
GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -15450,12 +16026,13 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: umax_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s1, -1 ; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -15467,28 +16044,31 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s1, -1 +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s1 ; 
GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s1, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132_DPP-NEXT: s_cbranch_execz .LBB27_2 +; GFX1132_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_scc0 .LBB27_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s5 ; GFX1132_DPP-NEXT: ds_max_rtn_u32 v0, v4, v0 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1132_DPP-NEXT: .LBB27_2: -; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -15510,12 +16090,14 @@ entry: define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-LABEL: umax_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_b64 s[4:5], vcc, -1 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB28_2 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB28_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -15523,8 +16105,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) 
%out) { ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: .LBB28_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: .LBB28_2: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 @@ -15546,9 +16128,11 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_cbranch_execz .LBB28_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB28_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 @@ -15556,8 +16140,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB28_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: .LBB28_2: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 @@ -15579,17 +16163,19 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB28_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB28_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX9-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB28_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: .LBB28_2: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 @@ -15609,11 +16195,13 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1064-LABEL: umax_i64_constant: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB28_2 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB28_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 @@ -15621,9 +16209,9 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB28_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: .LBB28_2: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 @@ -15642,10 +16230,12 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1032-LABEL: umax_i64_constant: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB28_2 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; 
GFX1032-NEXT: s_cbranch_scc0 .LBB28_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 @@ -15653,9 +16243,9 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB28_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: .LBB28_2: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 @@ -15674,12 +16264,14 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1164-LABEL: umax_i64_constant: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB28_2 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB28_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: v_mov_b32_e32 v0, 5 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 @@ -15687,8 +16279,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB28_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: .LBB28_2: ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 @@ -15709,19 +16301,21 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1132-LABEL: umax_i64_constant: ; GFX1132: ; %bb.0: ; 
%entry ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB28_2 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB28_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v0, 5 ; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX1132-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB28_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: .LBB28_2: ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 @@ -15774,10 +16368,11 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 +; GFX7LESS_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_scc0 .LBB29_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -15785,8 +16380,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4] ; GFX7LESS_ITERATIVE-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX7LESS_ITERATIVE-NEXT: .LBB29_4: ; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: .LBB29_4: ; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -15829,10 +16424,11 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 +; GFX8_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX8_ITERATIVE-NEXT: s_cbranch_scc0 .LBB29_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -15840,8 +16436,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4] ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_ITERATIVE-NEXT: .LBB29_4: ; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_ITERATIVE-NEXT: .LBB29_4: ; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 @@ -15884,18 +16480,19 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: 
$vgpr3_vgpr4 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 +; GFX9_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX9_ITERATIVE-NEXT: s_cbranch_scc0 .LBB29_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX9_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_ITERATIVE-NEXT: .LBB29_4: ; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_ITERATIVE-NEXT: .LBB29_4: ; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 @@ -15936,9 +16533,10 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc0 .LBB29_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -15946,9 +16544,9 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4] ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1064_ITERATIVE-NEXT: .LBB29_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: .LBB29_4: ; 
GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 @@ -15987,9 +16585,10 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 -; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, vcc_lo, exec_lo +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc0 .LBB29_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -15997,9 +16596,9 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4] ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1032_ITERATIVE-NEXT: .LBB29_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: .LBB29_4: ; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 @@ -16045,10 +16644,10 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 
+; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc0 .LBB29_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 @@ -16056,8 +16655,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: ds_max_rtn_u64 v[2:3], v4, v[2:3] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1164_ITERATIVE-NEXT: .LBB29_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: .LBB29_4: ; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 @@ -16100,20 +16699,21 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 -; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, vcc_lo, exec_lo +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc0 .LBB29_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 ; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 ; GFX1132_ITERATIVE-NEXT: ds_max_rtn_u64 v[2:3], v4, v[2:3] ; 
GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1132_ITERATIVE-NEXT: .LBB29_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: .LBB29_4: ; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 @@ -16145,20 +16745,21 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-LABEL: umax_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry ; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX8_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 ; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX8_DPP-NEXT: s_nop 0 @@ -16207,23 +16808,24 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s7, v4, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s6, v3, 63 ; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf 
bank_mask:0xf -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8_DPP-NEXT: s_cbranch_execz .LBB29_2 +; GFX8_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX8_DPP-NEXT: s_cbranch_scc0 .LBB29_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s6 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 ; GFX8_DPP-NEXT: ds_max_rtn_u64 v[9:10], v8, v[9:10] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_DPP-NEXT: .LBB29_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: .LBB29_2: ; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v10 @@ -16243,20 +16845,21 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-LABEL: umax_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX9_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 ; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX9_DPP-NEXT: s_nop 0 @@ 
-16305,22 +16908,23 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s7, v4, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s6, v3, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_DPP-NEXT: s_cbranch_execz .LBB29_2 +; GFX9_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9_DPP-NEXT: s_cbranch_scc0 .LBB29_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s6 ; GFX9_DPP-NEXT: ds_max_rtn_u64 v[9:10], v8, v[9:10] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_DPP-NEXT: .LBB29_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: .LBB29_2: ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v10 @@ -16339,11 +16943,12 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: umax_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; 
GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 @@ -16351,7 +16956,7 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -16393,53 +16998,56 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8] ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s4 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s6 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s7 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf 
bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v2, s4, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v1, s5, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX1064_DPP-NEXT: v_readlane_b32 s8, v4, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s9, v3, 47 -; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32 -; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 48 -; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v3, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s9, v4, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v4, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s10, v3, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s6, 32 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v3, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s9, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s10, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1064_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; 
GFX1064_DPP-NEXT: s_cbranch_execz .LBB29_2 +; GFX1064_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064_DPP-NEXT: s_cbranch_scc0 .LBB29_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s5 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s4 ; GFX1064_DPP-NEXT: ds_max_rtn_u64 v[11:12], v10, v[11:12] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv -; GFX1064_DPP-NEXT: .LBB29_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_DPP-NEXT: .LBB29_2: ; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1 ; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12 @@ -16455,19 +17063,20 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: umax_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1032_DPP-NEXT: s_mov_b32 s4, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -16507,32 +17116,35 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8] ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, 
vcc_lo ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s5, -1 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v4, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 -; GFX1032_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 15 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v2, s5, 16 -; GFX1032_DPP-NEXT: v_writelane_b32 v1, s6, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s5 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s5, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v2, s6, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v1, s7, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s5 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032_DPP-NEXT: s_cbranch_execz .LBB29_2 +; GFX1032_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_scc0 .LBB29_2 ; GFX1032_DPP-NEXT: ; %bb.1: ; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0 ; GFX1032_DPP-NEXT: ds_max_rtn_u64 v[11:12], v10, v[11:12] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv -; GFX1032_DPP-NEXT: .LBB29_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_DPP-NEXT: .LBB29_2: ; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; 
GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1 ; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12 @@ -16548,15 +17160,16 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; ; GFX1164_DPP-LABEL: umax_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v10 @@ -16564,7 +17177,7 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -16615,57 +17228,60 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 -; 
GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s6 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s7 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 31 -; GFX1164_DPP-NEXT: v_writelane_b32 v2, s4, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v1, s5, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX1164_DPP-NEXT: v_readlane_b32 s8, v4, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s9, v3, 47 -; GFX1164_DPP-NEXT: v_writelane_b32 v2, s6, 32 -; GFX1164_DPP-NEXT: v_writelane_b32 v1, s7, 32 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; 
GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX1164_DPP-NEXT: v_writelane_b32 v2, s8, 48 -; GFX1164_DPP-NEXT: v_writelane_b32 v1, s9, 48 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s6, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v3, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s7, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v4, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v4, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v4, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s10, v3, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s6, 32 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v3, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s8, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s9, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s10, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164_DPP-NEXT: s_cbranch_execz .LBB29_2 +; GFX1164_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164_DPP-NEXT: s_cbranch_scc0 .LBB29_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s5 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s4 ; GFX1164_DPP-NEXT: ds_max_rtn_u64 v[11:12], v10, v[11:12] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164_DPP-NEXT: .LBB29_2: -; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164_DPP-NEXT: s_load_b64 s[4:5], 
s[2:3], 0x24 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1 ; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v12 @@ -16684,6 +17300,7 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; ; GFX1132_DPP-LABEL: umax_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -16699,7 +17316,7 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v7, v1 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -16738,32 +17355,34 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8] ; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s5, -1 +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v4, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 
row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v2, s5, 16 -; GFX1132_DPP-NEXT: v_writelane_b32 v1, s6, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s5 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s5, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v2, s6, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v1, s7, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s5 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132_DPP-NEXT: s_cbranch_execz .LBB29_2 +; GFX1132_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_scc0 .LBB29_2 ; GFX1132_DPP-NEXT: ; %bb.1: ; GFX1132_DPP-NEXT: v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0 ; GFX1132_DPP-NEXT: ds_max_rtn_u64 v[11:12], v10, v[11:12] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv -; GFX1132_DPP-NEXT: .LBB29_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_DPP-NEXT: .LBB29_2: ; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v1 ; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v12 @@ -16809,18 +17428,19 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; 
GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4 +; GFX7LESS_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_scc0 .LBB30_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_ITERATIVE-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS_ITERATIVE-NEXT: .LBB30_4: ; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: .LBB30_4: ; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -16850,18 +17470,19 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4 +; GFX8_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX8_ITERATIVE-NEXT: s_cbranch_scc0 .LBB30_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_ITERATIVE-NEXT: .LBB30_4: ; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_ITERATIVE-NEXT: .LBB30_4: ; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 
s3, 0xf000 @@ -16891,17 +17512,18 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4 +; GFX9_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX9_ITERATIVE-NEXT: s_cbranch_scc0 .LBB30_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX9_ITERATIVE-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_ITERATIVE-NEXT: .LBB30_4: ; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_ITERATIVE-NEXT: .LBB30_4: ; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 @@ -16931,18 +17553,19 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4 +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc0 .LBB30_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064_ITERATIVE-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX1064_ITERATIVE-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1064_ITERATIVE-NEXT: .LBB30_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: .LBB30_4: ; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 @@ -16972,18 +17595,19 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4 +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc0 .LBB30_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 ; GFX1032_ITERATIVE-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1032_ITERATIVE-NEXT: .LBB30_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: .LBB30_4: ; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 @@ -17018,18 +17642,18 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164_ITERATIVE-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4 +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc0 .LBB30_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_ITERATIVE-NEXT: ds_min_rtn_u32 v1, v1, v2 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1164_ITERATIVE-NEXT: .LBB30_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: .LBB30_4: ; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 @@ -17062,19 +17686,20 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4 +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc0 .LBB30_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 ; GFX1132_ITERATIVE-NEXT: ds_min_rtn_u32 v1, v1, v2 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX1132_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1132_ITERATIVE-NEXT: .LBB30_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: .LBB30_4: ; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 @@ -17102,16 +17727,17 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; ; GFX8_DPP-LABEL: umin_i32_varying: ; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1 ; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -17123,22 +17749,23 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s6, v2, 63 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX8_DPP-NEXT: s_and_saveexec_b64 
s[0:1], vcc -; GFX8_DPP-NEXT: s_cbranch_execz .LBB30_2 +; GFX8_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX8_DPP-NEXT: s_cbranch_scc0 .LBB30_2 ; GFX8_DPP-NEXT: ; %bb.1: ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s6 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 ; GFX8_DPP-NEXT: ds_min_rtn_u32 v0, v0, v3 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_DPP-NEXT: .LBB30_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: .LBB30_2: ; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v1 @@ -17151,16 +17778,17 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; ; GFX9_DPP-LABEL: umin_i32_varying: ; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -17172,21 +17800,22 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s6, v2, 63 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: 
v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_DPP-NEXT: s_cbranch_execz .LBB30_2 +; GFX9_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9_DPP-NEXT: s_cbranch_scc0 .LBB30_2 ; GFX9_DPP-NEXT: ; %bb.1: ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s6 ; GFX9_DPP-NEXT: ds_min_rtn_u32 v0, v0, v3 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_DPP-NEXT: .LBB30_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: .LBB30_2: ; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1 @@ -17199,11 +17828,12 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: umin_i32_varying: ; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1 ; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf @@ -17212,38 +17842,42 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; 
GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s6 ; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 16 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 63 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064_DPP-NEXT: s_cbranch_execz .LBB30_2 +; GFX1064_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064_DPP-NEXT: s_cbranch_scc0 .LBB30_2 ; GFX1064_DPP-NEXT: ; %bb.1: ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 
v4, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s7 +; GFX1064_DPP-NEXT: s_mov_b32 s4, s7 ; GFX1064_DPP-NEXT: ds_min_rtn_u32 v0, v0, v4 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv -; GFX1064_DPP-NEXT: .LBB30_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_DPP-NEXT: .LBB30_2: ; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -17255,11 +17889,12 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: umin_i32_varying: ; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s1, -1 ; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1 ; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf @@ -17268,28 +17903,31 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s1, -1 +; GFX1032_DPP-NEXT: 
v_readlane_b32 s5, v1, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s1, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032_DPP-NEXT: s_cbranch_execz .LBB30_2 +; GFX1032_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_scc0 .LBB30_2 ; GFX1032_DPP-NEXT: ; %bb.1: ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s5 ; GFX1032_DPP-NEXT: ds_min_rtn_u32 v0, v0, v4 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv -; GFX1032_DPP-NEXT: .LBB30_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032_DPP-NEXT: .LBB30_2: ; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -17302,12 +17940,13 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-LABEL: umin_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 
row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -17320,40 +17959,43 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s6 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 16 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: 
v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 63 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164_DPP-NEXT: s_cbranch_execz .LBB30_2 +; GFX1164_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164_DPP-NEXT: s_cbranch_scc0 .LBB30_2 ; GFX1164_DPP-NEXT: ; %bb.1: ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s7 +; GFX1164_DPP-NEXT: s_mov_b32 s4, s7 ; GFX1164_DPP-NEXT: ds_min_rtn_u32 v0, v0, v4 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv -; GFX1164_DPP-NEXT: .LBB30_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_DPP-NEXT: .LBB30_2: ; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -17369,12 +18011,13 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-LABEL: umin_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo 
-; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s1, -1 ; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -17386,29 +18029,32 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s1, -1 +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s1 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1132_DPP-NEXT: s_and_saveexec_b32 
s1, vcc_lo -; GFX1132_DPP-NEXT: s_cbranch_execz .LBB30_2 +; GFX1132_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_scc0 .LBB30_2 ; GFX1132_DPP-NEXT: ; %bb.1: ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s5 ; GFX1132_DPP-NEXT: ds_min_rtn_u32 v0, v0, v4 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1132_DPP-NEXT: .LBB30_2: -; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -17430,12 +18076,14 @@ entry: define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-LABEL: umin_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_b64 s[4:5], vcc, -1 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB31_2 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB31_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -17443,8 +18091,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: .LBB31_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: .LBB31_2: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 @@ -17466,9 +18114,11 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, 
exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_cbranch_execz .LBB31_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB31_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 @@ -17476,8 +18126,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB31_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: .LBB31_2: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 @@ -17499,17 +18149,19 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB31_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB31_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB31_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: .LBB31_2: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 @@ -17529,11 +18181,13 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1064-LABEL: umin_i64_constant: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 
v0, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB31_2 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB31_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 @@ -17541,9 +18195,9 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB31_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: .LBB31_2: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 @@ -17562,10 +18216,12 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1032-LABEL: umin_i64_constant: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB31_2 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB31_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 @@ -17573,9 +18229,9 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB31_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: .LBB31_2: ; GFX1032-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 @@ -17594,12 +18250,14 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1164-LABEL: umin_i64_constant: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB31_2 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB31_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: v_mov_b32_e32 v0, 5 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 @@ -17607,8 +18265,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB31_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: .LBB31_2: ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 @@ -17629,19 +18287,21 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1132-LABEL: umin_i64_constant: ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB31_2 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB31_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 
v0, 5 ; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX1132-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB31_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: .LBB31_2: ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 @@ -17694,10 +18354,11 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 +; GFX7LESS_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_scc0 .LBB32_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -17705,8 +18366,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4] ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS_ITERATIVE-NEXT: .LBB32_4: ; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: .LBB32_4: ; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -17749,10 +18410,11 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, 
exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 +; GFX8_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX8_ITERATIVE-NEXT: s_cbranch_scc0 .LBB32_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -17760,8 +18422,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4] ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_ITERATIVE-NEXT: .LBB32_4: ; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_ITERATIVE-NEXT: .LBB32_4: ; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 @@ -17804,18 +18466,19 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 +; GFX9_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX9_ITERATIVE-NEXT: s_cbranch_scc0 .LBB32_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX9_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX9_ITERATIVE-NEXT: .LBB32_4: ; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_ITERATIVE-NEXT: .LBB32_4: ; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 @@ -17856,9 +18519,10 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc0 .LBB32_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -17866,9 +18530,9 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4] ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1064_ITERATIVE-NEXT: .LBB32_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: .LBB32_4: ; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 @@ -17907,9 +18571,10 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 
s4, exec_lo, s4 -; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, vcc_lo, exec_lo +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc0 .LBB32_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -17917,9 +18582,9 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4] ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1032_ITERATIVE-NEXT: .LBB32_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: .LBB32_4: ; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 @@ -17965,10 +18630,10 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164_ITERATIVE-NEXT: s_cmov_b64 exec, vcc +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc0 .LBB32_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 @@ -17976,8 +18641,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: ds_min_rtn_u64 v[2:3], v4, v[2:3] ; GFX1164_ITERATIVE-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1164_ITERATIVE-NEXT: .LBB32_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: .LBB32_4: ; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 @@ -18020,20 +18685,21 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 -; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, vcc_lo, exec_lo +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc0 .LBB32_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 ; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 ; GFX1132_ITERATIVE-NEXT: ds_min_rtn_u64 v[2:3], v4, v[2:3] ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv -; GFX1132_ITERATIVE-NEXT: .LBB32_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: .LBB32_4: ; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 @@ -18065,20 +18731,21 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-LABEL: umin_i64_varying: ; GFX8_DPP: ; %bb.0: ; 
%entry ; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX8_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 ; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, -1 ; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX8_DPP-NEXT: s_nop 0 @@ -18127,23 +18794,24 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s7, v4, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s6, v3, 63 ; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8_DPP-NEXT: s_cbranch_execz .LBB32_2 +; GFX8_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX8_DPP-NEXT: s_cbranch_scc0 .LBB32_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX8_DPP-NEXT: 
v_mov_b32_e32 v10, s7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s6 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 ; GFX8_DPP-NEXT: ds_min_rtn_u64 v[9:10], v8, v[9:10] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_DPP-NEXT: .LBB32_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: .LBB32_2: ; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v10 @@ -18163,20 +18831,21 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-LABEL: umin_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX9_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 ; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, -1 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX9_DPP-NEXT: s_nop 0 @@ -18225,22 +18894,23 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s7, v4, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s6, v3, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: 
v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_DPP-NEXT: s_cbranch_execz .LBB32_2 +; GFX9_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9_DPP-NEXT: s_cbranch_scc0 .LBB32_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s6 ; GFX9_DPP-NEXT: ds_min_rtn_u64 v[9:10], v8, v[9:10] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_DPP-NEXT: .LBB32_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: .LBB32_2: ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v10 @@ -18259,11 +18929,12 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: umin_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 @@ -18271,7 +18942,7 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, -1 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 
v2 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -18313,53 +18984,56 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8] ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s4 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s6 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s7 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v2, s4, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v1, s5, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX1064_DPP-NEXT: v_readlane_b32 s8, v4, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s9, v3, 47 -; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32 -; GFX1064_DPP-NEXT: 
v_writelane_b32 v1, s7, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 48 -; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v3, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s9, v4, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v4, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s10, v3, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s6, 32 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v3, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s9, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s10, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1064_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064_DPP-NEXT: s_cbranch_execz .LBB32_2 +; GFX1064_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064_DPP-NEXT: s_cbranch_scc0 .LBB32_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s5 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s4 ; GFX1064_DPP-NEXT: ds_min_rtn_u64 v[11:12], v10, v[11:12] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv -; GFX1064_DPP-NEXT: .LBB32_2: ; 
GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_DPP-NEXT: .LBB32_2: ; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1 ; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12 @@ -18375,19 +19049,20 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: umin_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1032_DPP-NEXT: s_mov_b32 s4, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, -1 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, -1 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -18427,32 +19102,35 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8] ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s5, -1 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v4, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 -; GFX1032_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: 
v_readlane_b32 s5, v4, 15 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v2, s5, 16 -; GFX1032_DPP-NEXT: v_writelane_b32 v1, s6, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s5 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s5, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v2, s6, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v1, s7, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s5 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032_DPP-NEXT: s_cbranch_execz .LBB32_2 +; GFX1032_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_scc0 .LBB32_2 ; GFX1032_DPP-NEXT: ; %bb.1: ; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0 ; GFX1032_DPP-NEXT: ds_min_rtn_u64 v[11:12], v10, v[11:12] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv -; GFX1032_DPP-NEXT: .LBB32_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_DPP-NEXT: .LBB32_2: ; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1 ; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12 @@ -18468,15 +19146,16 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; ; GFX1164_DPP-LABEL: umin_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX1164_DPP-NEXT: 
s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v10 @@ -18484,7 +19163,7 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, -1 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -18535,57 +19214,60 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s6 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s7 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: 
v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 31 -; GFX1164_DPP-NEXT: v_writelane_b32 v2, s4, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v1, s5, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX1164_DPP-NEXT: v_readlane_b32 s8, v4, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s9, v3, 47 -; GFX1164_DPP-NEXT: v_writelane_b32 v2, s6, 32 -; GFX1164_DPP-NEXT: v_writelane_b32 v1, s7, 32 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX1164_DPP-NEXT: v_writelane_b32 v2, s8, 48 -; GFX1164_DPP-NEXT: v_writelane_b32 v1, s9, 48 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s6, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v3, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s7, 16 +; GFX1164_DPP-NEXT: 
v_readlane_b32 s8, v4, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v4, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v4, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s10, v3, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s6, 32 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v3, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s8, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s9, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s10, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164_DPP-NEXT: s_cbranch_execz .LBB32_2 +; GFX1164_DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164_DPP-NEXT: s_cbranch_scc0 .LBB32_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s5 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s4 ; GFX1164_DPP-NEXT: ds_min_rtn_u64 v[11:12], v10, v[11:12] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164_DPP-NEXT: .LBB32_2: -; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1 ; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v12 @@ -18604,6 +19286,7 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; ; GFX1132_DPP-LABEL: umin_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, -1 @@ -18619,7 +19302,7 @@ define amdgpu_kernel void 
@umin_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, -1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, -1 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v7, v1 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -18658,32 +19341,34 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8] ; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s5, -1 +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v4, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v2, s5, 16 -; GFX1132_DPP-NEXT: v_writelane_b32 v1, s6, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s5 +; GFX1132_DPP-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s5, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v2, s6, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v1, s7, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s5 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132_DPP-NEXT: s_cbranch_execz .LBB32_2 +; GFX1132_DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_scc0 .LBB32_2 ; GFX1132_DPP-NEXT: ; %bb.1: ; GFX1132_DPP-NEXT: v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0 ; GFX1132_DPP-NEXT: ds_min_rtn_u64 v[11:12], v10, v[11:12] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv -; GFX1132_DPP-NEXT: .LBB32_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_DPP-NEXT: .LBB32_2: ; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v1 ; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v12 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll index f67fcd6e0caf53..7524ed201de167 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll @@ -16,30 +16,33 @@ declare void @llvm.amdgcn.raw.ptr.buffer.store.f32(float, ptr addrspace(8), i32, define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspace(8) inreg %inout) { ; GFX7-LABEL: add_i32_constant: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[10:11], exec +; GFX7-NEXT: s_mov_b64 s[8:9], exec +; GFX7-NEXT: s_and_b64 s[10:11], exec, exec ; GFX7-NEXT: ; implicit-def: $vgpr0 -; GFX7-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] -; GFX7-NEXT: s_cbranch_execz .LBB0_4 +; GFX7-NEXT: s_cmov_b64 exec, s[10:11] +; GFX7-NEXT: s_cbranch_scc0 .LBB0_4 ; GFX7-NEXT: ; %bb.1: ; GFX7-NEXT: 
s_mov_b64 s[12:13], exec +; GFX7-NEXT: s_mov_b64 s[10:11], exec ; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s12, 0 ; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s13, v0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: s_and_b64 s[14:15], vcc, -1 ; GFX7-NEXT: ; implicit-def: $vgpr1 -; GFX7-NEXT: s_and_saveexec_b64 s[10:11], vcc -; GFX7-NEXT: s_cbranch_execz .LBB0_3 +; GFX7-NEXT: s_cmov_b64 exec, vcc +; GFX7-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_bcnt1_i32_b64 s12, s[12:13] ; GFX7-NEXT: s_mul_i32 s12, s12, 5 ; GFX7-NEXT: v_mov_b32_e32 v1, s12 ; GFX7-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc -; GFX7-NEXT: .LBB0_3: ; GFX7-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX7-NEXT: .LBB0_3: ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readfirstlane_b32 s4, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, 5, s4 -; GFX7-NEXT: .LBB0_4: ; %Flow ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: .LBB0_4: ; %Flow ; GFX7-NEXT: s_wqm_b64 s[4:5], -1 ; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5] ; GFX7-NEXT: s_andn2_b64 vcc, exec, s[4:5] @@ -51,30 +54,33 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa ; ; GFX89-LABEL: add_i32_constant: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_mov_b64 s[10:11], exec +; GFX89-NEXT: s_mov_b64 s[8:9], exec +; GFX89-NEXT: s_and_b64 s[10:11], exec, exec ; GFX89-NEXT: ; implicit-def: $vgpr0 -; GFX89-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] -; GFX89-NEXT: s_cbranch_execz .LBB0_4 +; GFX89-NEXT: s_cmov_b64 exec, s[10:11] +; GFX89-NEXT: s_cbranch_scc0 .LBB0_4 ; GFX89-NEXT: ; %bb.1: ; GFX89-NEXT: s_mov_b64 s[12:13], exec ; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s12, 0 ; GFX89-NEXT: v_mbcnt_hi_u32_b32 v0, s13, v0 ; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX89-NEXT: s_mov_b64 s[10:11], exec +; GFX89-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX89-NEXT: ; implicit-def: $vgpr1 -; GFX89-NEXT: s_and_saveexec_b64 s[10:11], vcc -; GFX89-NEXT: s_cbranch_execz .LBB0_3 +; GFX89-NEXT: s_cmov_b64 exec, vcc +; GFX89-NEXT: 
s_cbranch_scc0 .LBB0_3 ; GFX89-NEXT: ; %bb.2: ; GFX89-NEXT: s_bcnt1_i32_b64 s12, s[12:13] ; GFX89-NEXT: s_mul_i32 s12, s12, 5 ; GFX89-NEXT: v_mov_b32_e32 v1, s12 ; GFX89-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc -; GFX89-NEXT: .LBB0_3: ; GFX89-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX89-NEXT: .LBB0_3: ; GFX89-NEXT: s_waitcnt vmcnt(0) ; GFX89-NEXT: v_readfirstlane_b32 s4, v1 ; GFX89-NEXT: v_mad_u32_u24 v0, v0, 5, s4 -; GFX89-NEXT: .LBB0_4: ; %Flow ; GFX89-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX89-NEXT: .LBB0_4: ; %Flow ; GFX89-NEXT: s_wqm_b64 s[4:5], -1 ; GFX89-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5] ; GFX89-NEXT: s_andn2_b64 vcc, exec, s[4:5] @@ -86,31 +92,34 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa ; ; GFX1064-LABEL: add_i32_constant: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[10:11], exec +; GFX1064-NEXT: s_mov_b64 s[8:9], exec +; GFX1064-NEXT: s_and_b64 s[10:11], exec, exec ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] -; GFX1064-NEXT: s_cbranch_execz .LBB0_4 +; GFX1064-NEXT: s_cmov_b64 exec, s[10:11] +; GFX1064-NEXT: s_cbranch_scc0 .LBB0_4 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_mov_b64 s[12:13], exec -; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: s_mov_b64 s[10:11], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s12, 0 +; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s13, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[10:11], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB0_3 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1064-NEXT: ; %bb.2: ; GFX1064-NEXT: s_bcnt1_i32_b64 s12, s[12:13] ; GFX1064-NEXT: s_mul_i32 s12, s12, 5 ; GFX1064-NEXT: v_mov_b32_e32 v1, s12 ; GFX1064-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc -; GFX1064-NEXT: .LBB0_3: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 
exec, exec, s[10:11] +; GFX1064-NEXT: .LBB0_3: ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 ; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s4 -; GFX1064-NEXT: .LBB0_4: ; %Flow ; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1064-NEXT: .LBB0_4: ; %Flow ; GFX1064-NEXT: s_wqm_b64 s[4:5], -1 ; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5] ; GFX1064-NEXT: s_andn2_b64 vcc, exec, s[4:5] @@ -122,30 +131,33 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa ; ; GFX1032-LABEL: add_i32_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s9, exec_lo +; GFX1032-NEXT: s_mov_b32 s8, exec_lo +; GFX1032-NEXT: s_and_b32 s9, exec_lo, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s8, s9 -; GFX1032-NEXT: s_cbranch_execz .LBB0_4 +; GFX1032-NEXT: s_cmov_b32 exec_lo, s9 +; GFX1032-NEXT: s_cbranch_scc0 .LBB0_4 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_mov_b32 s10, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: s_mov_b32 s9, exec_lo ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s10, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s9, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB0_3 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1032-NEXT: ; %bb.2: ; GFX1032-NEXT: s_bcnt1_i32_b32 s10, s10 ; GFX1032-NEXT: s_mul_i32 s10, s10, 5 ; GFX1032-NEXT: v_mov_b32_e32 v1, s10 ; GFX1032-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc -; GFX1032-NEXT: .LBB0_3: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX1032-NEXT: .LBB0_3: ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 ; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s4 -; GFX1032-NEXT: .LBB0_4: ; %Flow ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1032-NEXT: .LBB0_4: ; %Flow ; GFX1032-NEXT: s_wqm_b32 
s4, -1 ; GFX1032-NEXT: s_and_b32 s4, s4, s4 ; GFX1032-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4 @@ -157,11 +169,12 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa ; ; GFX1164-LABEL: add_i32_constant: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_mov_b64 s[10:11], exec +; GFX1164-NEXT: s_mov_b64 s[8:9], exec +; GFX1164-NEXT: s_and_b64 s[10:11], exec, exec ; GFX1164-NEXT: ; implicit-def: $vgpr0 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] -; GFX1164-NEXT: s_cbranch_execz .LBB0_4 +; GFX1164-NEXT: s_cmov_b64 exec, s[10:11] +; GFX1164-NEXT: s_cbranch_scc0 .LBB0_4 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_mov_b64 s[12:13], exec ; GFX1164-NEXT: s_mov_b64 s[10:11], exec @@ -169,22 +182,24 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s13, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB0_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1164-NEXT: ; %bb.2: ; GFX1164-NEXT: s_bcnt1_i32_b64 s12, s[12:13] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_mul_i32 s12, s12, 5 ; GFX1164-NEXT: v_mov_b32_e32 v1, s12 ; GFX1164-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc -; GFX1164-NEXT: .LBB0_3: ; GFX1164-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX1164-NEXT: .LBB0_3: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_readfirstlane_b32 s4, v1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mad_u32_u24 v0, v0, 5, s4 -; GFX1164-NEXT: .LBB0_4: ; %Flow ; GFX1164-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1164-NEXT: .LBB0_4: ; %Flow ; GFX1164-NEXT: s_wqm_b64 s[4:5], -1 ; 
GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5] @@ -199,33 +214,36 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa ; ; GFX1132-LABEL: add_i32_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_mov_b32 s9, exec_lo +; GFX1132-NEXT: s_mov_b32 s8, exec_lo +; GFX1132-NEXT: s_and_b32 s9, exec_lo, exec_lo ; GFX1132-NEXT: ; implicit-def: $vgpr0 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_saveexec_b32 s8, s9 -; GFX1132-NEXT: s_cbranch_execz .LBB0_4 +; GFX1132-NEXT: s_cmov_b32 exec_lo, s9 +; GFX1132-NEXT: s_cbranch_scc0 .LBB0_4 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_mov_b32 s10, exec_lo ; GFX1132-NEXT: s_mov_b32 s9, exec_lo ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s10, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB0_3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1132-NEXT: ; %bb.2: ; GFX1132-NEXT: s_bcnt1_i32_b32 s10, s10 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_mul_i32 s10, s10, 5 ; GFX1132-NEXT: v_mov_b32_e32 v1, s10 ; GFX1132-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc -; GFX1132-NEXT: .LBB0_3: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX1132-NEXT: .LBB0_3: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_readfirstlane_b32 s4, v1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_mad_u32_u24 v0, v0, 5, s4 -; GFX1132-NEXT: .LBB0_4: ; %Flow ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1132-NEXT: .LBB0_4: ; %Flow ; GFX1132-NEXT: s_wqm_b32 s4, -1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: 
s_and_b32 s4, s4, s4 @@ -266,22 +284,24 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; ; GFX8-LABEL: add_i32_varying: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_mov_b64 s[10:11], exec ; GFX8-NEXT: s_mov_b64 s[8:9], exec -; GFX8-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX8-NEXT: s_and_b64 s[10:11], s[10:11], exec ; GFX8-NEXT: ; implicit-def: $vgpr3 -; GFX8-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] -; GFX8-NEXT: s_cbranch_execz .LBB1_4 +; GFX8-NEXT: s_cmov_b64 exec, s[10:11] +; GFX8-NEXT: s_cbranch_scc0 .LBB1_4 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GFX8-NEXT: s_mov_b64 s[10:11], exec +; GFX8-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[10:11] +; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GFX8-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -293,25 +313,26 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s12, v2, 63 +; GFX8-NEXT: v_readlane_b32 s14, v2, 63 ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[10:11] +; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; 
GFX8-NEXT: s_and_saveexec_b64 s[10:11], vcc -; GFX8-NEXT: s_cbranch_execz .LBB1_3 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB1_3 ; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: v_mov_b32_e32 v0, s12 +; GFX8-NEXT: v_mov_b32_e32 v0, s14 ; GFX8-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc -; GFX8-NEXT: .LBB1_3: ; GFX8-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX8-NEXT: .LBB1_3: ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v0 -; GFX8-NEXT: .LBB1_4: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: .LBB1_4: ; %Flow ; GFX8-NEXT: s_wqm_b64 s[4:5], -1 ; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5] ; GFX8-NEXT: s_andn2_b64 vcc, exec, s[4:5] @@ -323,22 +344,24 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; ; GFX9-LABEL: add_i32_varying: ; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_mov_b64 s[10:11], exec ; GFX9-NEXT: s_mov_b64 s[8:9], exec -; GFX9-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX9-NEXT: s_and_b64 s[10:11], s[10:11], exec ; GFX9-NEXT: ; implicit-def: $vgpr3 -; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] -; GFX9-NEXT: s_cbranch_execz .LBB1_4 +; GFX9-NEXT: s_cmov_b64 exec, s[10:11] +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GFX9-NEXT: s_mov_b64 s[10:11], exec +; GFX9-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[10:11] +; GFX9-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: 
v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -350,25 +373,26 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s12, v2, 63 +; GFX9-NEXT: v_readlane_b32 s14, v2, 63 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[10:11] +; GFX9-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[10:11], vcc -; GFX9-NEXT: s_cbranch_execz .LBB1_3 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB1_3 ; GFX9-NEXT: ; %bb.2: -; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v0, s14 ; GFX9-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc -; GFX9-NEXT: .LBB1_3: ; GFX9-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX9-NEXT: .LBB1_3: ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: v_add_u32_e32 v3, s4, v0 -; GFX9-NEXT: .LBB1_4: ; %Flow ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-NEXT: .LBB1_4: ; %Flow ; GFX9-NEXT: s_wqm_b64 s[4:5], -1 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5] ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] @@ -380,17 +404,19 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; ; GFX1064-LABEL: add_i32_varying: ; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: s_mov_b64 s[10:11], exec ; GFX1064-NEXT: s_mov_b64 s[8:9], exec +; GFX1064-NEXT: s_and_b64 s[10:11], s[10:11], exec ; GFX1064-NEXT: ; implicit-def: $vgpr4 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[8:9] -; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] -; GFX1064-NEXT: s_cbranch_execz .LBB1_4 +; GFX1064-NEXT: s_cmov_b64 exec, 
s[10:11] +; GFX1064-NEXT: s_cbranch_scc0 .LBB1_4 ; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_mov_b64 s[10:11], exec ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_not_b64 exec, exec ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GFX1064-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -399,40 +425,43 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1064-NEXT: v_mov_b32_e32 v2, v1 ; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s12, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s12 +; GFX1064-NEXT: v_readlane_b32 s14, v1, 31 +; GFX1064-NEXT: v_mov_b32_e32 v2, s14 ; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s12, v1, 15 -; GFX1064-NEXT: v_readlane_b32 s13, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s12, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[10:11] +; GFX1064-NEXT: v_readlane_b32 s14, v1, 15 +; GFX1064-NEXT: s_mov_b64 exec, s[12:13] ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[10:11], -1 -; GFX1064-NEXT: v_readlane_b32 s12, v1, 63 -; GFX1064-NEXT: v_readlane_b32 s14, v1, 47 -; GFX1064-NEXT: v_writelane_b32 v3, s13, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[10:11] +; GFX1064-NEXT: s_or_saveexec_b64 s[12:13], -1 +; GFX1064-NEXT: v_readlane_b32 s15, v1, 31 +; GFX1064-NEXT: v_writelane_b32 v3, s14, 16 +; GFX1064-NEXT: s_mov_b64 exec, s[12:13] ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: s_or_saveexec_b64 
s[10:11], -1 -; GFX1064-NEXT: v_writelane_b32 v3, s14, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[10:11] +; GFX1064-NEXT: s_or_saveexec_b64 s[12:13], -1 +; GFX1064-NEXT: v_writelane_b32 v3, s15, 32 +; GFX1064-NEXT: v_readlane_b32 s15, v1, 63 +; GFX1064-NEXT: v_readlane_b32 s14, v1, 47 +; GFX1064-NEXT: s_mov_b64 exec, s[12:13] ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_or_saveexec_b64 s[12:13], -1 +; GFX1064-NEXT: v_writelane_b32 v3, s14, 48 +; GFX1064-NEXT: s_mov_b64 exec, s[12:13] +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[10:11], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB1_3 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB1_3 ; GFX1064-NEXT: ; %bb.2: -; GFX1064-NEXT: v_mov_b32_e32 v0, s12 +; GFX1064-NEXT: v_mov_b32_e32 v0, s15 ; GFX1064-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc -; GFX1064-NEXT: .LBB1_3: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX1064-NEXT: .LBB1_3: ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1064-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-NEXT: v_add_nc_u32_e32 v4, s4, v0 -; GFX1064-NEXT: .LBB1_4: ; %Flow ; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1064-NEXT: .LBB1_4: ; %Flow ; GFX1064-NEXT: s_wqm_b64 s[4:5], -1 ; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5] ; GFX1064-NEXT: s_andn2_b64 vcc, exec, s[4:5] @@ -444,17 +473,19 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; ; GFX1032-LABEL: add_i32_varying: ; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: s_mov_b32 s9, exec_lo ; GFX1032-NEXT: s_mov_b32 s8, exec_lo +; GFX1032-NEXT: s_and_b32 s9, s9, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr4 -; GFX1032-NEXT: s_mov_b32 s9, s8 -; GFX1032-NEXT: s_and_saveexec_b32 s8, s9 -; GFX1032-NEXT: s_cbranch_execz .LBB1_4 +; GFX1032-NEXT: s_cmov_b32 exec_lo, s9 +; GFX1032-NEXT: s_cbranch_scc0 .LBB1_4 ; GFX1032-NEXT: ; %bb.1: 
+; GFX1032-NEXT: s_mov_b32 s9, exec_lo ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s9, -1 +; GFX1032-NEXT: s_or_saveexec_b32 s10, -1 ; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -463,30 +494,33 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1032-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s11, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s10, v1, 15 -; GFX1032-NEXT: s_mov_b32 exec_lo, s9 +; GFX1032-NEXT: s_mov_b32 exec_lo, s10 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s9, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s10, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s9 +; GFX1032-NEXT: s_or_saveexec_b32 s10, -1 +; GFX1032-NEXT: v_readlane_b32 s12, v1, 31 +; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_readlane_b32 s11, v1, 15 +; GFX1032-NEXT: s_mov_b32 exec_lo, s10 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_or_saveexec_b32 s10, -1 +; GFX1032-NEXT: v_writelane_b32 v3, s11, 16 +; GFX1032-NEXT: s_mov_b32 exec_lo, s10 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s9, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB1_3 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB1_3 ; GFX1032-NEXT: ; %bb.2: -; GFX1032-NEXT: v_mov_b32_e32 v0, s11 +; GFX1032-NEXT: v_mov_b32_e32 v0, s12 ; GFX1032-NEXT: 
buffer_atomic_add v0, off, s[4:7], 0 glc -; GFX1032-NEXT: .LBB1_3: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX1032-NEXT: .LBB1_3: ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1032-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-NEXT: v_add_nc_u32_e32 v4, s4, v0 -; GFX1032-NEXT: .LBB1_4: ; %Flow ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1032-NEXT: .LBB1_4: ; %Flow ; GFX1032-NEXT: s_wqm_b32 s4, -1 ; GFX1032-NEXT: s_and_b32 s4, s4, s4 ; GFX1032-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4 @@ -498,18 +532,20 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; ; GFX1164-LABEL: add_i32_varying: ; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: s_mov_b64 s[10:11], exec ; GFX1164-NEXT: s_mov_b64 s[8:9], exec +; GFX1164-NEXT: s_and_b64 s[10:11], s[10:11], exec ; GFX1164-NEXT: ; implicit-def: $vgpr4 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_mov_b64 s[10:11], s[8:9] -; GFX1164-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] -; GFX1164-NEXT: s_cbranch_execz .LBB1_4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmov_b64 exec, s[10:11] +; GFX1164-NEXT: s_cbranch_scc0 .LBB1_4 ; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_mov_b64 s[10:11], exec ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_not_b64 exec, exec ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GFX1164-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 @@ -522,44 +558,47 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s12, v1, 31 +; GFX1164-NEXT: v_readlane_b32 s14, v1, 31 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s12 +; GFX1164-NEXT: v_mov_b32_e32 v2, s14 ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s12, v1, 15 -; GFX1164-NEXT: v_readlane_b32 s13, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s12, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[10:11] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: v_readlane_b32 s14, v1, 15 +; GFX1164-NEXT: s_mov_b64 exec, s[12:13] ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[10:11], -1 -; GFX1164-NEXT: v_readlane_b32 s12, v1, 63 -; GFX1164-NEXT: v_readlane_b32 s14, v1, 47 -; GFX1164-NEXT: v_writelane_b32 v3, s13, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[10:11] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164-NEXT: s_or_saveexec_b64 s[12:13], -1 +; GFX1164-NEXT: v_readlane_b32 s15, v1, 31 +; GFX1164-NEXT: v_writelane_b32 v3, s14, 16 +; GFX1164-NEXT: s_mov_b64 exec, s[12:13] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: s_or_saveexec_b64 s[10:11], -1 -; GFX1164-NEXT: v_writelane_b32 v3, s14, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[10:11] +; GFX1164-NEXT: s_or_saveexec_b64 s[12:13], -1 +; GFX1164-NEXT: v_writelane_b32 v3, s15, 32 +; GFX1164-NEXT: v_readlane_b32 s15, v1, 63 +; GFX1164-NEXT: 
v_readlane_b32 s14, v1, 47 +; GFX1164-NEXT: s_mov_b64 exec, s[12:13] ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_or_saveexec_b64 s[12:13], -1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: v_writelane_b32 v3, s14, 48 +; GFX1164-NEXT: s_mov_b64 exec, s[12:13] +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[10:11], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB1_3 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB1_3 ; GFX1164-NEXT: ; %bb.2: -; GFX1164-NEXT: v_mov_b32_e32 v0, s12 +; GFX1164-NEXT: v_mov_b32_e32 v0, s15 ; GFX1164-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc -; GFX1164-NEXT: .LBB1_3: ; GFX1164-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX1164-NEXT: .LBB1_3: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1164-NEXT: v_mov_b32_e32 v0, v3 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_add_nc_u32_e32 v4, s4, v0 -; GFX1164-NEXT: .LBB1_4: ; %Flow ; GFX1164-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1164-NEXT: .LBB1_4: ; %Flow ; GFX1164-NEXT: s_wqm_b64 s[4:5], -1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5] @@ -574,18 +613,20 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; ; GFX1132-LABEL: add_i32_varying: ; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: s_mov_b32 s9, exec_lo ; GFX1132-NEXT: s_mov_b32 s8, exec_lo +; GFX1132-NEXT: s_and_b32 s9, s9, exec_lo ; GFX1132-NEXT: ; implicit-def: $vgpr4 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: s_mov_b32 s9, s8 -; GFX1132-NEXT: s_and_saveexec_b32 s8, s9 -; GFX1132-NEXT: s_cbranch_execz .LBB1_4 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_cmov_b32 exec_lo, s9 +; GFX1132-NEXT: s_cbranch_scc0 .LBB1_4 ; GFX1132-NEXT: ; %bb.1: +; 
GFX1132-NEXT: s_mov_b32 s9, exec_lo ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s9, -1 +; GFX1132-NEXT: s_or_saveexec_b32 s10, -1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 @@ -596,34 +637,37 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_readlane_b32 s11, v1, 31 -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: v_readlane_b32 s10, v1, 15 -; GFX1132-NEXT: s_mov_b32 exec_lo, s9 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-NEXT: s_mov_b32 exec_lo, s10 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s9, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s10, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s9 +; GFX1132-NEXT: s_or_saveexec_b32 s10, -1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-NEXT: v_readlane_b32 s12, v1, 31 +; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132-NEXT: v_readlane_b32 s11, v1, 15 +; GFX1132-NEXT: s_mov_b32 exec_lo, s10 ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_or_saveexec_b32 s10, -1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) +; 
GFX1132-NEXT: v_writelane_b32 v3, s11, 16 +; GFX1132-NEXT: s_mov_b32 exec_lo, s10 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s9, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB1_3 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB1_3 ; GFX1132-NEXT: ; %bb.2: -; GFX1132-NEXT: v_mov_b32_e32 v0, s11 +; GFX1132-NEXT: v_mov_b32_e32 v0, s12 ; GFX1132-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc -; GFX1132-NEXT: .LBB1_3: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX1132-NEXT: .LBB1_3: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1132-NEXT: v_mov_b32_e32 v0, v3 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_add_nc_u32_e32 v4, s4, v0 -; GFX1132-NEXT: .LBB1_4: ; %Flow ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1132-NEXT: .LBB1_4: ; %Flow ; GFX1132-NEXT: s_wqm_b32 s4, -1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_b32 s4, s4, s4 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll index 3a2efadac067d0..ddd3e57e7e43c5 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -19,12 +19,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-LABEL: add_i32_constant: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX6-NEXT: s_cbranch_execz .LBB0_2 +; GFX6-NEXT: s_cmov_b64 exec, vcc +; GFX6-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX6-NEXT: ; 
%bb.1: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -32,8 +34,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc -; GFX6-NEXT: .LBB0_2: ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: .LBB0_2: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -50,9 +52,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_cbranch_execz .LBB0_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -60,8 +64,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc -; GFX8-NEXT: .LBB0_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: .LBB0_2: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 @@ -78,9 +82,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB0_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc 
+; GFX9-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -88,8 +94,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc -; GFX9-NEXT: .LBB0_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: .LBB0_2: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 @@ -102,12 +108,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-LABEL: add_i32_constant: ; GFX10W64: ; %bb.0: ; %entry ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec -; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: s_mov_b64 s[0:1], exec ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB0_2 +; GFX10W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX10W64-NEXT: s_cmov_b64 exec, vcc +; GFX10W64-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -115,9 +123,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc -; GFX10W64-NEXT: .LBB0_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10W64-NEXT: .LBB0_2: ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) ; GFX10W64-NEXT: s_mov_b32 null, 0 @@ -131,11 +139,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; 
GFX10W32-LABEL: add_i32_constant: ; GFX10W32: ; %bb.0: ; %entry ; GFX10W32-NEXT: s_mov_b32 s1, exec_lo -; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: s_mov_b32 s0, exec_lo ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB0_2 +; GFX10W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10W32-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX10W32-NEXT: ; %bb.1: ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s1, s1 @@ -143,9 +153,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc -; GFX10W32-NEXT: .LBB0_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10W32-NEXT: .LBB0_2: ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: s_mov_b32 null, 0 @@ -164,8 +174,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W64-NEXT: s_cbranch_execz .LBB0_2 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX11W64-NEXT: s_cmov_b64 exec, vcc +; GFX11W64-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -174,8 +186,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: 
buffer_atomic_add_u32 v1, off, s[8:11], 0 glc -; GFX11W64-NEXT: .LBB0_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11W64-NEXT: .LBB0_2: ; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -195,8 +207,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W32-NEXT: s_cbranch_execz .LBB0_2 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX11W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11W32-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s1, s1 @@ -205,8 +219,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc -; GFX11W32-NEXT: .LBB0_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11W32-NEXT: .LBB0_2: ; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 @@ -227,8 +241,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W64-NEXT: s_cbranch_execz .LBB0_2 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX12W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX12W64-NEXT: s_cmov_b64 exec, vcc +; GFX12W64-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX12W64-NEXT: ; %bb.1: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, 
s[4:5] @@ -237,8 +253,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: .LBB0_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12W64-NEXT: .LBB0_2: ; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -258,8 +274,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W32-NEXT: s_cbranch_execz .LBB0_2 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX12W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12W32-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 @@ -268,8 +286,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: .LBB0_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12W32-NEXT: .LBB0_2: ; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 @@ -291,13 +309,15 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-LABEL: add_i32_uniform: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: s_load_dword s6, s[2:3], 0x11 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; 
GFX6-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX6-NEXT: s_cbranch_execz .LBB1_2 +; GFX6-NEXT: s_cmov_b64 exec, vcc +; GFX6-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX6-NEXT: ; %bb.1: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -305,8 +325,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-NEXT: s_mul_i32 s4, s6, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc -; GFX6-NEXT: .LBB1_2: ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: .LBB1_2: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -320,14 +340,16 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: add_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_cbranch_execz .LBB1_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -335,8 +357,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX8-NEXT: s_mul_i32 s4, s6, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc -; GFX8-NEXT: .LBB1_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: .LBB1_2: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -350,14 +372,16 
@@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -365,8 +389,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: s_mul_i32 s4, s6, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc -; GFX9-NEXT: .LBB1_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: .LBB1_2: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -381,12 +405,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64: ; %bb.0: ; %entry ; GFX10W64-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec -; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: s_mov_b64 s[0:1], exec ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB1_2 +; GFX10W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX10W64-NEXT: s_cmov_b64 exec, vcc +; GFX10W64-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W64-NEXT: 
s_bcnt1_i32_b64 s4, s[4:5] @@ -394,9 +420,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64-NEXT: s_mul_i32 s4, s6, s4 ; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc -; GFX10W64-NEXT: .LBB1_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10W64-NEXT: .LBB1_2: ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) ; GFX10W64-NEXT: s_mov_b32 null, 0 @@ -411,11 +437,13 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W32: ; %bb.0: ; %entry ; GFX10W32-NEXT: s_load_dword s0, s[2:3], 0x44 ; GFX10W32-NEXT: s_mov_b32 s4, exec_lo -; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: s_mov_b32 s1, exec_lo ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB1_2 +; GFX10W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10W32-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX10W32-NEXT: ; %bb.1: ; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -423,9 +451,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W32-NEXT: s_mul_i32 s4, s0, s4 ; GFX10W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W32-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc -; GFX10W32-NEXT: .LBB1_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10W32-NEXT: .LBB1_2: ; GFX10W32-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: s_mov_b32 null, 0 @@ -445,8 +473,10 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W64-NEXT: s_cbranch_execz .LBB1_2 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX11W64-NEXT: s_cmov_b64 exec, vcc +; GFX11W64-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -455,8 +485,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc -; GFX11W64-NEXT: .LBB1_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11W64-NEXT: .LBB1_2: ; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -477,8 +507,10 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W32-NEXT: s_cbranch_execz .LBB1_2 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX11W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11W32-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -487,8 +519,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc -; GFX11W32-NEXT: .LBB1_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11W32-NEXT: .LBB1_2: ; GFX11W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; 
GFX11W32-NEXT: v_readfirstlane_b32 s4, v1 @@ -510,8 +542,10 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W64-NEXT: s_cbranch_execz .LBB1_2 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX12W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX12W64-NEXT: s_cmov_b64 exec, vcc +; GFX12W64-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX12W64-NEXT: ; %bb.1: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -520,8 +554,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: .LBB1_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12W64-NEXT: .LBB1_2: ; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -542,8 +576,10 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W32-NEXT: s_cbranch_execz .LBB1_2 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX12W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12W32-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -552,8 +588,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W32-NEXT: v_mov_b32_e32 v1, s4 ; 
GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: .LBB1_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12W32-NEXT: .LBB1_2: ; GFX12W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1 @@ -593,17 +629,18 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX6-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX6-NEXT: ; implicit-def: $vgpr0 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX6-NEXT: s_cbranch_execz .LBB2_4 +; GFX6-NEXT: s_cmov_b64 exec, vcc +; GFX6-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX6-NEXT: ; %bb.3: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc -; GFX6-NEXT: .LBB2_4: ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: .LBB2_4: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -635,17 +672,18 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8-NEXT: s_cbranch_execz .LBB2_4 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v0, off, 
s[8:11], 0 glc -; GFX8-NEXT: .LBB2_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: .LBB2_4: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -676,17 +714,18 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB2_4 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc -; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -716,17 +755,18 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX10W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX10W64-NEXT: s_cbranch_execz .LBB2_4 +; GFX10W64-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX10W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX10W64-NEXT: s_cmov_b64 exec, vcc +; GFX10W64-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX10W64-NEXT: ; %bb.3: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc -; GFX10W64-NEXT: .LBB2_4: ; GFX10W64-NEXT: s_waitcnt_depctr 
0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10W64-NEXT: .LBB2_4: ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) ; GFX10W64-NEXT: s_mov_b32 null, 0 @@ -756,17 +796,18 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 -; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10W32-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX10W32-NEXT: s_cbranch_execz .LBB2_4 +; GFX10W32-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX10W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10W32-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX10W32-NEXT: ; %bb.3: ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; GFX10W32-NEXT: v_mov_b32_e32 v0, s0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc -; GFX10W32-NEXT: .LBB2_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10W32-NEXT: .LBB2_4: ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: s_mov_b32 null, 0 @@ -801,17 +842,17 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 -; GFX11W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX11W64-NEXT: s_cbranch_execz .LBB2_4 +; GFX11W64-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX11W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX11W64-NEXT: s_cmov_b64 exec, vcc +; GFX11W64-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX11W64-NEXT: ; %bb.3: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: 
buffer_atomic_add_u32 v1, off, s[8:11], 0 glc -; GFX11W64-NEXT: .LBB2_4: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11W64-NEXT: .LBB2_4: ; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -844,19 +885,20 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 -; GFX11W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX11W32-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX11W32-NEXT: s_cbranch_execz .LBB2_4 +; GFX11W32-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX11W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX11W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11W32-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX11W32-NEXT: ; %bb.3: ; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 ; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc -; GFX11W32-NEXT: .LBB2_4: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11W32-NEXT: .LBB2_4: ; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 @@ -892,17 +934,17 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX12W64-NEXT: s_cbranch_execz .LBB2_4 +; GFX12W64-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX12W64-NEXT: 
s_cmp_lg_u64 vcc, 0 +; GFX12W64-NEXT: s_cmov_b64 exec, vcc +; GFX12W64-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: .LBB2_4: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12W64-NEXT: .LBB2_4: ; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -935,19 +977,20 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 -; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX12W32-NEXT: s_cbranch_execz .LBB2_4 +; GFX12W32-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX12W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX12W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12W32-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX12W32-NEXT: ; %bb.3: ; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 ; GFX12W32-NEXT: v_mov_b32_e32 v1, s0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: .LBB2_4: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12W32-NEXT: .LBB2_4: ; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 @@ -1089,12 +1132,14 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-LABEL: sub_i32_constant: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec +; 
GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX6-NEXT: s_cbranch_execz .LBB4_2 +; GFX6-NEXT: s_cmov_b64 exec, vcc +; GFX6-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX6-NEXT: ; %bb.1: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1102,8 +1147,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc -; GFX6-NEXT: .LBB4_2: ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: .LBB4_2: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -1121,9 +1166,11 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_cbranch_execz .LBB4_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1131,8 +1178,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc -; GFX8-NEXT: .LBB4_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: .LBB4_2: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 @@ -1150,9 +1197,11 @@ 
define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB4_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1160,8 +1209,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc -; GFX9-NEXT: .LBB4_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: .LBB4_2: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 @@ -1175,12 +1224,14 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-LABEL: sub_i32_constant: ; GFX10W64: ; %bb.0: ; %entry ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec -; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: s_mov_b64 s[0:1], exec ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB4_2 +; GFX10W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX10W64-NEXT: s_cmov_b64 exec, vcc +; GFX10W64-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1188,9 +1239,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 
; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc -; GFX10W64-NEXT: .LBB4_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10W64-NEXT: .LBB4_2: ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) ; GFX10W64-NEXT: s_mov_b32 null, 0 @@ -1205,11 +1256,13 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W32-LABEL: sub_i32_constant: ; GFX10W32: ; %bb.0: ; %entry ; GFX10W32-NEXT: s_mov_b32 s1, exec_lo -; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: s_mov_b32 s0, exec_lo ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB4_2 +; GFX10W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10W32-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX10W32-NEXT: ; %bb.1: ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s1, s1 @@ -1217,9 +1270,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[4:7], 0 glc -; GFX10W32-NEXT: .LBB4_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10W32-NEXT: .LBB4_2: ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: s_mov_b32 null, 0 @@ -1239,8 +1292,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W64-NEXT: s_cbranch_execz .LBB4_2 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; 
GFX11W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX11W64-NEXT: s_cmov_b64 exec, vcc +; GFX11W64-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1249,8 +1304,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc -; GFX11W64-NEXT: .LBB4_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11W64-NEXT: .LBB4_2: ; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -1271,8 +1326,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W32-NEXT: s_cbranch_execz .LBB4_2 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX11W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11W32-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s1, s1 @@ -1281,8 +1338,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], 0 glc -; GFX11W32-NEXT: .LBB4_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11W32-NEXT: .LBB4_2: ; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 @@ -1304,8 +1361,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W64-NEXT: s_cbranch_execz .LBB4_2 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX12W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX12W64-NEXT: s_cmov_b64 exec, vcc +; GFX12W64-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX12W64-NEXT: ; %bb.1: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1314,8 +1373,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: .LBB4_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12W64-NEXT: .LBB4_2: ; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -1336,8 +1395,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W32-NEXT: s_cbranch_execz .LBB4_2 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX12W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12W32-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 @@ -1346,8 +1407,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: .LBB4_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12W32-NEXT: .LBB4_2: ; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; 
GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 @@ -1370,13 +1431,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-LABEL: sub_i32_uniform: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: s_load_dword s6, s[2:3], 0x11 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX6-NEXT: s_cbranch_execz .LBB5_2 +; GFX6-NEXT: s_cmov_b64 exec, vcc +; GFX6-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX6-NEXT: ; %bb.1: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1384,8 +1447,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-NEXT: s_mul_i32 s4, s6, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc -; GFX6-NEXT: .LBB5_2: ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: .LBB5_2: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -1399,14 +1462,16 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: sub_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_cbranch_execz .LBB5_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 
s4, s[4:5] @@ -1414,8 +1479,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX8-NEXT: s_mul_i32 s4, s6, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc -; GFX8-NEXT: .LBB5_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: .LBB5_2: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -1429,14 +1494,16 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1444,8 +1511,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: s_mul_i32 s4, s6, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc -; GFX9-NEXT: .LBB5_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: .LBB5_2: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -1460,12 +1527,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64: ; %bb.0: ; %entry ; GFX10W64-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec -; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: s_mov_b64 s[0:1], exec ; GFX10W64-NEXT: 
v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB5_2 +; GFX10W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX10W64-NEXT: s_cmov_b64 exec, vcc +; GFX10W64-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1473,9 +1542,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64-NEXT: s_mul_i32 s4, s6, s4 ; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc -; GFX10W64-NEXT: .LBB5_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10W64-NEXT: .LBB5_2: ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -1490,11 +1559,13 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W32: ; %bb.0: ; %entry ; GFX10W32-NEXT: s_load_dword s0, s[2:3], 0x44 ; GFX10W32-NEXT: s_mov_b32 s4, exec_lo -; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: s_mov_b32 s1, exec_lo ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB5_2 +; GFX10W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10W32-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX10W32-NEXT: ; %bb.1: ; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -1502,9 +1573,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W32-NEXT: s_mul_i32 s4, s0, s4 ; GFX10W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W32-NEXT: buffer_atomic_sub v1, 
off, s[8:11], 0 glc -; GFX10W32-NEXT: .LBB5_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10W32-NEXT: .LBB5_2: ; GFX10W32-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: v_mul_lo_u32 v0, s0, v0 @@ -1524,8 +1595,10 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W64-NEXT: s_cbranch_execz .LBB5_2 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX11W64-NEXT: s_cmov_b64 exec, vcc +; GFX11W64-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1534,8 +1607,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc -; GFX11W64-NEXT: .LBB5_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11W64-NEXT: .LBB5_2: ; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -1557,8 +1630,10 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W32-NEXT: s_cbranch_execz .LBB5_2 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX11W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11W32-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 
; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -1567,8 +1642,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc -; GFX11W32-NEXT: .LBB5_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11W32-NEXT: .LBB5_2: ; GFX11W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: v_mul_lo_u32 v0, s0, v0 @@ -1591,8 +1666,10 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W64-NEXT: s_cbranch_execz .LBB5_2 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX12W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX12W64-NEXT: s_cmov_b64 exec, vcc +; GFX12W64-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX12W64-NEXT: ; %bb.1: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1601,8 +1678,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: .LBB5_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12W64-NEXT: .LBB5_2: ; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -1624,8 +1701,10 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W32-NEXT: s_cbranch_execz .LBB5_2 +; 
GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX12W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12W32-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -1634,8 +1713,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: .LBB5_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12W32-NEXT: .LBB5_2: ; GFX12W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: v_mul_lo_u32 v0, s0, v0 @@ -1676,17 +1755,18 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX6-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX6-NEXT: ; implicit-def: $vgpr0 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX6-NEXT: s_cbranch_execz .LBB6_4 +; GFX6-NEXT: s_cmov_b64 exec, vcc +; GFX6-NEXT: s_cbranch_scc0 .LBB6_4 ; GFX6-NEXT: ; %bb.3: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc -; GFX6-NEXT: .LBB6_4: ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: .LBB6_4: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -1718,17 +1798,18 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; 
GFX8-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8-NEXT: s_cbranch_execz .LBB6_4 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB6_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc -; GFX8-NEXT: .LBB6_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: .LBB6_4: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -1759,17 +1840,18 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB6_4 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB6_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc -; GFX9-NEXT: .LBB6_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: .LBB6_4: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -1799,17 +1881,18 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX10W64-NEXT: s_xor_b64 s[0:1], 
exec, s[0:1] -; GFX10W64-NEXT: s_cbranch_execz .LBB6_4 +; GFX10W64-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX10W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX10W64-NEXT: s_cmov_b64 exec, vcc +; GFX10W64-NEXT: s_cbranch_scc0 .LBB6_4 ; GFX10W64-NEXT: ; %bb.3: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc -; GFX10W64-NEXT: .LBB6_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10W64-NEXT: .LBB6_4: ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) ; GFX10W64-NEXT: s_mov_b32 null, 0 @@ -1839,17 +1922,18 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 -; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10W32-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX10W32-NEXT: s_cbranch_execz .LBB6_4 +; GFX10W32-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX10W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10W32-NEXT: s_cbranch_scc0 .LBB6_4 ; GFX10W32-NEXT: ; %bb.3: ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; GFX10W32-NEXT: v_mov_b32_e32 v0, s0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc -; GFX10W32-NEXT: .LBB6_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10W32-NEXT: .LBB6_4: ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: s_mov_b32 null, 0 @@ -1884,17 +1968,17 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 -; GFX11W64-NEXT: 
s_and_saveexec_b64 s[0:1], vcc -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX11W64-NEXT: s_cbranch_execz .LBB6_4 +; GFX11W64-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX11W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX11W64-NEXT: s_cmov_b64 exec, vcc +; GFX11W64-NEXT: s_cbranch_scc0 .LBB6_4 ; GFX11W64-NEXT: ; %bb.3: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc -; GFX11W64-NEXT: .LBB6_4: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11W64-NEXT: .LBB6_4: ; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -1927,19 +2011,20 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 -; GFX11W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX11W32-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX11W32-NEXT: s_cbranch_execz .LBB6_4 +; GFX11W32-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX11W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX11W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11W32-NEXT: s_cbranch_scc0 .LBB6_4 ; GFX11W32-NEXT: ; %bb.3: ; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 ; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], 0 glc -; GFX11W32-NEXT: .LBB6_4: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11W32-NEXT: .LBB6_4: ; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, 
v1 @@ -1976,17 +2061,17 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX12W64-NEXT: s_cbranch_execz .LBB6_4 +; GFX12W64-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX12W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX12W64-NEXT: s_cmov_b64 exec, vcc +; GFX12W64-NEXT: s_cbranch_scc0 .LBB6_4 ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: .LBB6_4: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12W64-NEXT: .LBB6_4: ; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -2019,19 +2104,20 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 -; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX12W32-NEXT: s_cbranch_execz .LBB6_4 +; GFX12W32-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX12W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX12W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12W32-NEXT: s_cbranch_scc0 .LBB6_4 ; GFX12W32-NEXT: ; %bb.3: ; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 ; GFX12W32-NEXT: v_mov_b32_e32 v1, s0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; 
GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: .LBB6_4: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12W32-NEXT: .LBB6_4: ; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll index d0c0b62c78e42b..39bb8adf09f0b9 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -19,12 +19,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-LABEL: add_i32_constant: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX6-NEXT: s_cbranch_execz .LBB0_2 +; GFX6-NEXT: s_cmov_b64 exec, vcc +; GFX6-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX6-NEXT: ; %bb.1: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -33,8 +35,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc -; GFX6-NEXT: .LBB0_2: ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: .LBB0_2: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -51,9 +53,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; 
GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_cbranch_execz .LBB0_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -62,8 +66,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc -; GFX8-NEXT: .LBB0_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: .LBB0_2: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 @@ -80,9 +84,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB0_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -91,8 +97,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc -; GFX9-NEXT: .LBB0_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: .LBB0_2: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 @@ -105,12 +111,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-LABEL: add_i32_constant: ; GFX10W64: ; %bb.0: 
; %entry ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec -; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: s_mov_b64 s[0:1], exec ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB0_2 +; GFX10W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX10W64-NEXT: s_cmov_b64 exec, vcc +; GFX10W64-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -119,9 +127,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc -; GFX10W64-NEXT: .LBB0_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10W64-NEXT: .LBB0_2: ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) ; GFX10W64-NEXT: s_mov_b32 null, 0 @@ -135,11 +143,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W32-LABEL: add_i32_constant: ; GFX10W32: ; %bb.0: ; %entry ; GFX10W32-NEXT: s_mov_b32 s1, exec_lo -; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: s_mov_b32 s0, exec_lo ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB0_2 +; GFX10W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10W32-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX10W32-NEXT: ; %bb.1: ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s1, s1 @@ -148,9 +158,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr 
addrspace ; GFX10W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_add v1, v2, s[4:7], 0 idxen glc -; GFX10W32-NEXT: .LBB0_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10W32-NEXT: .LBB0_2: ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: s_mov_b32 null, 0 @@ -169,8 +179,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W64-NEXT: s_cbranch_execz .LBB0_2 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX11W64-NEXT: s_cmov_b64 exec, vcc +; GFX11W64-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -180,8 +192,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc -; GFX11W64-NEXT: .LBB0_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11W64-NEXT: .LBB0_2: ; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -201,8 +213,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W32-NEXT: s_cbranch_execz .LBB0_2 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX11W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; 
GFX11W32-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s1, s1 @@ -212,8 +226,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], 0 idxen glc -; GFX11W32-NEXT: .LBB0_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11W32-NEXT: .LBB0_2: ; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 @@ -234,8 +248,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W64-NEXT: s_cbranch_execz .LBB0_2 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX12W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX12W64-NEXT: s_cmov_b64 exec, vcc +; GFX12W64-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX12W64-NEXT: ; %bb.1: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -245,8 +261,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: .LBB0_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12W64-NEXT: .LBB0_2: ; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -266,8 +282,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 
-; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W32-NEXT: s_cbranch_execz .LBB0_2 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX12W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12W32-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 @@ -276,8 +294,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: .LBB0_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12W32-NEXT: .LBB0_2: ; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 @@ -299,13 +317,15 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-LABEL: add_i32_uniform: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: s_load_dword s6, s[2:3], 0x11 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX6-NEXT: s_cbranch_execz .LBB1_2 +; GFX6-NEXT: s_cmov_b64 exec, vcc +; GFX6-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX6-NEXT: ; %bb.1: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -314,8 +334,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc -; GFX6-NEXT: .LBB1_2: ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: .LBB1_2: ; GFX6-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -329,14 +349,16 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: add_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_cbranch_execz .LBB1_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -345,8 +367,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc -; GFX8-NEXT: .LBB1_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: .LBB1_2: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -360,14 +382,16 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX9-NEXT: ; %bb.1: ; 
GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -376,8 +400,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc -; GFX9-NEXT: .LBB1_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: .LBB1_2: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -392,12 +416,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64: ; %bb.0: ; %entry ; GFX10W64-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec -; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: s_mov_b64 s[0:1], exec ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB1_2 +; GFX10W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX10W64-NEXT: s_cmov_b64 exec, vcc +; GFX10W64-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -406,9 +432,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64-NEXT: s_mul_i32 s4, s6, s4 ; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W64-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc -; GFX10W64-NEXT: .LBB1_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10W64-NEXT: .LBB1_2: ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) ; GFX10W64-NEXT: s_mov_b32 null, 0 @@ -423,11 +449,13 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W32: ; %bb.0: ; %entry ; GFX10W32-NEXT: 
s_load_dword s0, s[2:3], 0x44 ; GFX10W32-NEXT: s_mov_b32 s4, exec_lo -; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: s_mov_b32 s1, exec_lo ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB1_2 +; GFX10W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10W32-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX10W32-NEXT: ; %bb.1: ; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -436,9 +464,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W32-NEXT: s_mul_i32 s4, s0, s4 ; GFX10W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W32-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc -; GFX10W32-NEXT: .LBB1_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10W32-NEXT: .LBB1_2: ; GFX10W32-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: s_mov_b32 null, 0 @@ -458,8 +486,10 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W64-NEXT: s_cbranch_execz .LBB1_2 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX11W64-NEXT: s_cmov_b64 exec, vcc +; GFX11W64-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -469,8 +499,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: 
buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc -; GFX11W64-NEXT: .LBB1_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11W64-NEXT: .LBB1_2: ; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -491,8 +521,10 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W32-NEXT: s_cbranch_execz .LBB1_2 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX11W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11W32-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -502,8 +534,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc -; GFX11W32-NEXT: .LBB1_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11W32-NEXT: .LBB1_2: ; GFX11W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1 @@ -525,8 +557,10 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W64-NEXT: s_cbranch_execz .LBB1_2 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX12W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX12W64-NEXT: s_cmov_b64 exec, vcc +; GFX12W64-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX12W64-NEXT: ; %bb.1: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: 
s_bcnt1_i32_b64 s4, s[4:5] @@ -536,8 +570,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: .LBB1_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12W64-NEXT: .LBB1_2: ; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -558,8 +592,10 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W32-NEXT: s_cbranch_execz .LBB1_2 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX12W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12W32-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -568,8 +604,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: .LBB1_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12W32-NEXT: .LBB1_2: ; GFX12W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1 @@ -609,18 +645,19 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX6-NEXT: s_and_b64 s[6:7], 
vcc, -1 ; GFX6-NEXT: ; implicit-def: $vgpr0 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX6-NEXT: s_cbranch_execz .LBB2_4 +; GFX6-NEXT: s_cmov_b64 exec, vcc +; GFX6-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX6-NEXT: ; %bb.3: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc -; GFX6-NEXT: .LBB2_4: ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: .LBB2_4: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -652,18 +689,19 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8-NEXT: s_cbranch_execz .LBB2_4 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc -; GFX8-NEXT: .LBB2_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: .LBB2_4: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -694,18 +732,19 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: ; 
implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB2_4 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc -; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -735,18 +774,19 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX10W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX10W64-NEXT: s_cbranch_execz .LBB2_4 +; GFX10W64-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX10W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX10W64-NEXT: s_cmov_b64 exec, vcc +; GFX10W64-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX10W64-NEXT: ; %bb.3: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc -; GFX10W64-NEXT: .LBB2_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10W64-NEXT: .LBB2_4: ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) ; GFX10W64-NEXT: s_mov_b32 null, 0 @@ -776,18 +816,19 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 -; GFX10W32-NEXT: 
s_and_saveexec_b32 s1, vcc_lo -; GFX10W32-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX10W32-NEXT: s_cbranch_execz .LBB2_4 +; GFX10W32-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX10W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10W32-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX10W32-NEXT: ; %bb.3: ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; GFX10W32-NEXT: v_mov_b32_e32 v0, s0 ; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_add v0, v2, s[4:7], 0 idxen glc -; GFX10W32-NEXT: .LBB2_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10W32-NEXT: .LBB2_4: ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: s_mov_b32 null, 0 @@ -822,18 +863,18 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 -; GFX11W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX11W64-NEXT: s_cbranch_execz .LBB2_4 +; GFX11W64-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX11W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX11W64-NEXT: s_cmov_b64 exec, vcc +; GFX11W64-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX11W64-NEXT: ; %bb.3: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc -; GFX11W64-NEXT: .LBB2_4: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11W64-NEXT: .LBB2_4: ; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -866,19 +907,20 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; 
GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 -; GFX11W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX11W32-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX11W32-NEXT: s_cbranch_execz .LBB2_4 +; GFX11W32-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX11W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX11W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11W32-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX11W32-NEXT: ; %bb.3: ; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 ; GFX11W32-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, 0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], 0 idxen glc -; GFX11W32-NEXT: .LBB2_4: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11W32-NEXT: .LBB2_4: ; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 @@ -914,18 +956,18 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX12W64-NEXT: s_cbranch_execz .LBB2_4 +; GFX12W64-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX12W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX12W64-NEXT: s_cmov_b64 exec, vcc +; GFX12W64-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen 
th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: .LBB2_4: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12W64-NEXT: .LBB2_4: ; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -958,19 +1000,20 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 -; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX12W32-NEXT: s_cbranch_execz .LBB2_4 +; GFX12W32-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX12W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX12W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12W32-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX12W32-NEXT: ; %bb.3: ; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 ; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: .LBB2_4: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12W32-NEXT: .LBB2_4: ; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 @@ -1246,12 +1289,14 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-LABEL: sub_i32_constant: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], 
vcc -; GFX6-NEXT: s_cbranch_execz .LBB5_2 +; GFX6-NEXT: s_cmov_b64 exec, vcc +; GFX6-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX6-NEXT: ; %bb.1: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1260,8 +1305,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc -; GFX6-NEXT: .LBB5_2: ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: .LBB5_2: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -1279,9 +1324,11 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_cbranch_execz .LBB5_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1290,8 +1337,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc -; GFX8-NEXT: .LBB5_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: .LBB5_2: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 @@ -1309,9 +1356,11 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; 
GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1320,8 +1369,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc -; GFX9-NEXT: .LBB5_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: .LBB5_2: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 @@ -1335,12 +1384,14 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-LABEL: sub_i32_constant: ; GFX10W64: ; %bb.0: ; %entry ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec -; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: s_mov_b64 s[0:1], exec ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB5_2 +; GFX10W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX10W64-NEXT: s_cmov_b64 exec, vcc +; GFX10W64-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1349,9 +1400,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc -; GFX10W64-NEXT: .LBB5_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10W64-NEXT: .LBB5_2: ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W64-NEXT: 
s_waitcnt vmcnt(0) ; GFX10W64-NEXT: s_mov_b32 null, 0 @@ -1366,11 +1417,13 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W32-LABEL: sub_i32_constant: ; GFX10W32: ; %bb.0: ; %entry ; GFX10W32-NEXT: s_mov_b32 s1, exec_lo -; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: s_mov_b32 s0, exec_lo ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB5_2 +; GFX10W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10W32-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX10W32-NEXT: ; %bb.1: ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s1, s1 @@ -1379,9 +1432,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_sub v1, v2, s[4:7], 0 idxen glc -; GFX10W32-NEXT: .LBB5_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10W32-NEXT: .LBB5_2: ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: s_mov_b32 null, 0 @@ -1401,8 +1454,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W64-NEXT: s_cbranch_execz .LBB5_2 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX11W64-NEXT: s_cmov_b64 exec, vcc +; GFX11W64-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1412,8 +1467,8 @@ 
define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc -; GFX11W64-NEXT: .LBB5_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11W64-NEXT: .LBB5_2: ; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -1434,8 +1489,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W32-NEXT: s_cbranch_execz .LBB5_2 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX11W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11W32-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s1, s1 @@ -1445,8 +1502,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[4:7], 0 idxen glc -; GFX11W32-NEXT: .LBB5_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11W32-NEXT: .LBB5_2: ; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 @@ -1468,8 +1525,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W64-NEXT: s_cbranch_execz .LBB5_2 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX12W64-NEXT: s_cmp_lg_u64 vcc, 0 +; 
GFX12W64-NEXT: s_cmov_b64 exec, vcc +; GFX12W64-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX12W64-NEXT: ; %bb.1: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1479,8 +1538,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: .LBB5_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12W64-NEXT: .LBB5_2: ; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -1501,8 +1560,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W32-NEXT: s_cbranch_execz .LBB5_2 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX12W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12W32-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 @@ -1511,8 +1572,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: .LBB5_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12W32-NEXT: .LBB5_2: ; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 @@ -1535,13 +1596,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-LABEL: sub_i32_uniform: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: 
s_mov_b64 s[4:5], exec +; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: s_load_dword s6, s[2:3], 0x11 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX6-NEXT: s_cbranch_execz .LBB6_2 +; GFX6-NEXT: s_cmov_b64 exec, vcc +; GFX6-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX6-NEXT: ; %bb.1: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1550,8 +1613,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc -; GFX6-NEXT: .LBB6_2: ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: .LBB6_2: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -1565,14 +1628,16 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: sub_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_cbranch_execz .LBB6_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1581,8 +1646,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: buffer_atomic_sub 
v1, v2, s[8:11], 0 idxen glc -; GFX8-NEXT: .LBB6_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: .LBB6_2: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -1596,14 +1661,16 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1612,8 +1679,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc -; GFX9-NEXT: .LBB6_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: .LBB6_2: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -1628,12 +1695,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64: ; %bb.0: ; %entry ; GFX10W64-NEXT: s_load_dword s6, s[2:3], 0x44 ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec -; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: s_mov_b64 s[0:1], exec ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; 
GFX10W64-NEXT: s_cbranch_execz .LBB6_2 +; GFX10W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX10W64-NEXT: s_cmov_b64 exec, vcc +; GFX10W64-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1642,9 +1711,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64-NEXT: s_mul_i32 s4, s6, s4 ; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W64-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc -; GFX10W64-NEXT: .LBB6_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10W64-NEXT: .LBB6_2: ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -1659,11 +1728,13 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W32: ; %bb.0: ; %entry ; GFX10W32-NEXT: s_load_dword s0, s[2:3], 0x44 ; GFX10W32-NEXT: s_mov_b32 s4, exec_lo -; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: s_mov_b32 s1, exec_lo ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB6_2 +; GFX10W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10W32-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX10W32-NEXT: ; %bb.1: ; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -1672,9 +1743,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W32-NEXT: s_mul_i32 s4, s0, s4 ; GFX10W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W32-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc -; GFX10W32-NEXT: .LBB6_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10W32-NEXT: .LBB6_2: ; GFX10W32-NEXT: s_load_dwordx2 s[4:5], 
s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: v_mul_lo_u32 v0, s0, v0 @@ -1694,8 +1765,10 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W64-NEXT: s_cbranch_execz .LBB6_2 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX11W64-NEXT: s_cmov_b64 exec, vcc +; GFX11W64-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1705,8 +1778,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc -; GFX11W64-NEXT: .LBB6_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11W64-NEXT: .LBB6_2: ; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -1728,8 +1801,10 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W32-NEXT: s_cbranch_execz .LBB6_2 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX11W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11W32-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -1739,8 +1814,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; 
GFX11W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc -; GFX11W32-NEXT: .LBB6_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11W32-NEXT: .LBB6_2: ; GFX11W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: v_mul_lo_u32 v0, s0, v0 @@ -1763,8 +1838,10 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W64-NEXT: s_cbranch_execz .LBB6_2 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX12W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX12W64-NEXT: s_cmov_b64 exec, vcc +; GFX12W64-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX12W64-NEXT: ; %bb.1: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1774,8 +1851,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: .LBB6_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12W64-NEXT: .LBB6_2: ; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -1797,8 +1874,10 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W32-NEXT: s_cbranch_execz .LBB6_2 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX12W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12W32-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX12W32-NEXT: 
; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -1807,8 +1886,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: .LBB6_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12W32-NEXT: .LBB6_2: ; GFX12W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: v_mul_lo_u32 v0, s0, v0 @@ -1849,18 +1928,19 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX6-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX6-NEXT: ; implicit-def: $vgpr0 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX6-NEXT: s_cbranch_execz .LBB7_4 +; GFX6-NEXT: s_cmov_b64 exec, vcc +; GFX6-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX6-NEXT: ; %bb.3: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc -; GFX6-NEXT: .LBB7_4: ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: .LBB7_4: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -1892,18 +1972,19 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: 
s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8-NEXT: s_cbranch_execz .LBB7_4 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc -; GFX8-NEXT: .LBB7_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: .LBB7_4: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -1934,18 +2015,19 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB7_4 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc -; GFX9-NEXT: .LBB7_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: .LBB7_4: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -1975,18 +2057,19 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX10W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX10W64-NEXT: s_cbranch_execz 
.LBB7_4 +; GFX10W64-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX10W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX10W64-NEXT: s_cmov_b64 exec, vcc +; GFX10W64-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX10W64-NEXT: ; %bb.3: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc -; GFX10W64-NEXT: .LBB7_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10W64-NEXT: .LBB7_4: ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) ; GFX10W64-NEXT: s_mov_b32 null, 0 @@ -2016,18 +2099,19 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 -; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10W32-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX10W32-NEXT: s_cbranch_execz .LBB7_4 +; GFX10W32-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX10W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10W32-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX10W32-NEXT: ; %bb.3: ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; GFX10W32-NEXT: v_mov_b32_e32 v0, s0 ; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_sub v0, v2, s[4:7], 0 idxen glc -; GFX10W32-NEXT: .LBB7_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10W32-NEXT: .LBB7_4: ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: s_mov_b32 null, 0 @@ -2062,18 +2146,18 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX11W64-NEXT: ; 
implicit-def: $vgpr1 -; GFX11W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX11W64-NEXT: s_cbranch_execz .LBB7_4 +; GFX11W64-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX11W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX11W64-NEXT: s_cmov_b64 exec, vcc +; GFX11W64-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX11W64-NEXT: ; %bb.3: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc -; GFX11W64-NEXT: .LBB7_4: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11W64-NEXT: .LBB7_4: ; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -2106,19 +2190,20 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 -; GFX11W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX11W32-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX11W32-NEXT: s_cbranch_execz .LBB7_4 +; GFX11W32-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX11W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX11W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11W32-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX11W32-NEXT: ; %bb.3: ; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 ; GFX11W32-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, 0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[4:7], 0 idxen glc -; GFX11W32-NEXT: .LBB7_4: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11W32-NEXT: .LBB7_4: ; 
GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 @@ -2155,18 +2240,18 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX12W64-NEXT: s_cbranch_execz .LBB7_4 +; GFX12W64-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX12W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX12W64-NEXT: s_cmov_b64 exec, vcc +; GFX12W64-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: .LBB7_4: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12W64-NEXT: .LBB7_4: ; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -2199,19 +2284,20 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 -; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX12W32-NEXT: s_cbranch_execz .LBB7_4 +; GFX12W32-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX12W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX12W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12W32-NEXT: s_cbranch_scc0 
.LBB7_4 ; GFX12W32-NEXT: ; %bb.3: ; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 ; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: .LBB7_4: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12W32-NEXT: .LBB7_4: ; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll index 4f0bc512565d13..10df9357f81d4d 100644 --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll @@ -21,10 +21,10 @@ define float @syncscope_system(ptr %addr, float %val) #0 { ; GFX908-NEXT: buffer_wbinvl1_vol ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB0_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -33,17 +33,19 @@ define float @syncscope_system(ptr %addr, float %val) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX90A-NEXT: ; implicit-def: $vgpr3 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB0_6 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB0_6 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; 
GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 +; GFX90A-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX90A-NEXT: ; implicit-def: $vgpr3 -; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB0_3 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_add_f32 v3, v[0:1], v2, off glc @@ -52,9 +54,12 @@ define float @syncscope_system(ptr %addr, float %val) #0 { ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2 +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: .LBB0_3: ; %Flow -; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB0_5 +; GFX90A-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[6:7] +; GFX90A-NEXT: s_cbranch_scc0 .LBB0_5 ; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc @@ -62,20 +67,23 @@ define float @syncscope_system(ptr %addr, float %val) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v1, v3, v2 ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: .LBB0_5: ; %Flow1 -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: .LBB0_6: ; %Flow2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB0_8 +; GFX90A-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[4:5] +; GFX90A-NEXT: s_cbranch_scc0 .LBB0_8 ; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 
0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX90A-NEXT: ds_add_rtn_f32 v3, v0, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB0_8: ; %atomicrmw.phi -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB0_8: ; %atomicrmw.end ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -131,10 +139,10 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) #0 { ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB1_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -143,24 +151,29 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX90A-NEXT: ; implicit-def: $vgpr3 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB1_6 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB1_6 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 +; GFX90A-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX90A-NEXT: ; implicit-def: $vgpr3 -; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB1_3 +; GFX90A-NEXT: 
s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB1_3 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX90A-NEXT: global_atomic_add_f32 v3, v[0:1], v2, off glc ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2 +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: .LBB1_3: ; %Flow -; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB1_5 +; GFX90A-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[6:7] +; GFX90A-NEXT: s_cbranch_scc0 .LBB1_5 ; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc @@ -168,21 +181,24 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v1, v3, v2 ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: .LBB1_5: ; %Flow1 -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: .LBB1_6: ; %Flow2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB1_8 +; GFX90A-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[4:5] +; GFX90A-NEXT: s_cbranch_scc0 .LBB1_8 ; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ds_add_rtn_f32 v3, v0, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB1_8: ; %atomicrmw.phi -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB1_8: ; %atomicrmw.end ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 
s[30:31] @@ -226,48 +242,51 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) #0 { ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB2_3 -; GFX908-NEXT: ; %bb.1: ; %Flow2 -; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB2_8 -; GFX908-NEXT: .LBB2_2: ; %atomicrmw.phi -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: s_setpc_b64 s[30:31] -; GFX908-NEXT: .LBB2_3: ; %atomicrmw.check.private +; GFX908-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX908-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX908-NEXT: s_cmov_b64 exec, vcc +; GFX908-NEXT: s_cbranch_scc0 .LBB2_6 +; GFX908-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX908-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 -; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX908-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX908-NEXT: s_cbranch_execz .LBB2_5 -; GFX908-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX908-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX908-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX908-NEXT: s_cmov_b64 exec, vcc +; GFX908-NEXT: s_cbranch_scc0 .LBB2_3 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: ; implicit-def: $vgpr2 -; GFX908-NEXT: .LBB2_5: ; %Flow -; GFX908-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; GFX908-NEXT: s_cbranch_execz .LBB2_7 -; GFX908-NEXT: ; %bb.6: ; %atomicrmw.private +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: .LBB2_3: ; %Flow +; GFX908-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; GFX908-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX908-NEXT: s_cmov_b64 exec, s[6:7] +; GFX908-NEXT: s_cbranch_scc0 .LBB2_5 +; GFX908-NEXT: ; %bb.4: ; %atomicrmw.private ; 
GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX908-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX908-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX908-NEXT: .LBB2_7: ; %Flow1 -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: .LBB2_5: ; %Flow1 ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: ; implicit-def: $vgpr2 -; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_cbranch_execz .LBB2_2 -; GFX908-NEXT: .LBB2_8: ; %atomicrmw.shared +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: .LBB2_6: ; %Flow2 +; GFX908-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX908-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX908-NEXT: s_cmov_b64 exec, s[4:5] +; GFX908-NEXT: s_cbranch_scc0 .LBB2_8 +; GFX908-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX908-NEXT: ds_add_f32 v0, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: .LBB2_8: ; %atomicrmw.phi ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -276,48 +295,51 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB2_3 -; GFX90A-NEXT: ; %bb.1: ; %Flow2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB2_8 -; GFX90A-NEXT: .LBB2_2: ; %atomicrmw.phi -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; GFX90A-NEXT: .LBB2_3: ; 
%atomicrmw.check.private +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB2_6 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 -; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB2_5 -; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX90A-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB2_3 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: .LBB2_5: ; %Flow -; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB2_7 -; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB2_3: ; %Flow +; GFX90A-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[6:7] +; GFX90A-NEXT: s_cbranch_scc0 .LBB2_5 +; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX90A-NEXT: .LBB2_7: ; %Flow1 -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: .LBB2_5: ; %Flow1 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB2_2 -; GFX90A-NEXT: .LBB2_8: ; %atomicrmw.shared +; GFX90A-NEXT: s_or_b64 exec, exec, 
s[4:5] +; GFX90A-NEXT: .LBB2_6: ; %Flow2 +; GFX90A-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[4:5] +; GFX90A-NEXT: s_cbranch_scc0 .LBB2_8 +; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX90A-NEXT: ds_add_f32 v0, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB2_8: ; %atomicrmw.phi ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -370,10 +392,10 @@ define float @no_unsafe(ptr %addr, float %val) { ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB3_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -391,10 +413,10 @@ define float @no_unsafe(ptr %addr, float %val) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -422,11 +444,11 @@ define float @no_unsafe(ptr %addr, float %val) { ; GFX1100-NEXT: buffer_gl0_inv ; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX1100-NEXT: s_or_b32 s0, vcc_lo, s0 -; 
GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1100-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1100-NEXT: s_cbranch_execnz .LBB3_1 +; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1100-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX1100-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1100-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1100-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1100-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1100-NEXT: v_mov_b32_e32 v0, v3 ; GFX1100-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll index f9a43dd61c8cfb..5316c04e31e057 100644 --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll @@ -17,10 +17,10 @@ define i32 @atomic_nand_i32_lds(ptr addrspace(3) %ptr) nounwind { ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB0_1 +; GCN-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN-NEXT: s_cbranch_scc1 .LBB0_1 ; GCN-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw nand ptr addrspace(3) %ptr, i32 4 seq_cst @@ -44,10 +44,10 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GCN-NEXT: buffer_wbinvl1_vol ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB1_1 +; GCN-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN-NEXT: s_cbranch_scc1 .LBB1_1 ; GCN-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw nand ptr addrspace(1) 
%ptr, i32 4 seq_cst @@ -71,10 +71,10 @@ define i32 @atomic_nand_i32_flat(ptr %ptr) nounwind { ; GCN-NEXT: buffer_wbinvl1_vol ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB2_1 +; GCN-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN-NEXT: s_cbranch_scc1 .LBB2_1 ; GCN-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw nand ptr %ptr, i32 4 seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll b/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll index 3ed2cb856eaea8..885d5d6e66617e 100644 --- a/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll @@ -18,56 +18,60 @@ define i32 @prolog_spill(i32 %arg0, i32 %arg1, i32 %arg2) { ; REGALLOC-NEXT: renamable $sgpr6 = IMPLICIT_DEF ; REGALLOC-NEXT: renamable $vgpr1 = COPY killed renamable $sgpr6 ; REGALLOC-NEXT: SI_SPILL_V32_SAVE killed $vgpr1, %stack.3, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) - ; REGALLOC-NEXT: renamable $sgpr6_sgpr7 = COPY $exec, implicit-def $exec - ; REGALLOC-NEXT: renamable $sgpr4_sgpr5 = S_AND_B64 renamable $sgpr6_sgpr7, killed renamable $sgpr4_sgpr5, implicit-def dead $scc - ; REGALLOC-NEXT: renamable $sgpr6_sgpr7 = S_XOR_B64 renamable $sgpr4_sgpr5, killed renamable $sgpr6_sgpr7, implicit-def dead $scc + ; REGALLOC-NEXT: renamable $sgpr6_sgpr7 = S_XOR_B64 renamable $sgpr4_sgpr5, $exec, implicit-def dead $scc ; REGALLOC-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr6, 0, $vgpr0, implicit-def $sgpr6_sgpr7, implicit $sgpr6_sgpr7 ; REGALLOC-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr7, 1, $vgpr0, implicit killed $sgpr6_sgpr7 ; REGALLOC-NEXT: SI_SPILL_WWM_V32_SAVE killed $vgpr0, 
%stack.2, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) - ; REGALLOC-NEXT: $exec = S_MOV_B64_term killed renamable $sgpr4_sgpr5 - ; REGALLOC-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec - ; REGALLOC-NEXT: S_BRANCH %bb.3 + ; REGALLOC-NEXT: S_CMP_LG_U64_term renamable $sgpr4_sgpr5, 0, implicit-def $scc + ; REGALLOC-NEXT: $exec = S_CMOV_B64_term killed renamable $sgpr4_sgpr5, implicit $scc + ; REGALLOC-NEXT: S_CBRANCH_SCC1 %bb.3, implicit killed $scc + ; REGALLOC-NEXT: S_BRANCH %bb.1 ; REGALLOC-NEXT: {{ $}} ; REGALLOC-NEXT: bb.1.Flow: ; REGALLOC-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) ; REGALLOC-NEXT: {{ $}} ; REGALLOC-NEXT: $vgpr0 = SI_SPILL_WWM_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; REGALLOC-NEXT: $vgpr1 = SI_SPILL_V32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) ; REGALLOC-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr4_sgpr5 ; REGALLOC-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1 - ; REGALLOC-NEXT: renamable $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 killed renamable $sgpr4_sgpr5, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; REGALLOC-NEXT: $vgpr1 = SI_SPILL_V32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) ; REGALLOC-NEXT: SI_SPILL_V32_SAVE killed $vgpr1, %stack.6, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5) - ; REGALLOC-NEXT: renamable $sgpr4_sgpr5 = S_AND_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc - ; REGALLOC-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr4, 2, $vgpr0, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5 - ; REGALLOC-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr5, 3, $vgpr0, implicit $sgpr4_sgpr5 + ; REGALLOC-NEXT: renamable $sgpr6_sgpr7 = S_XOR_B64 renamable $sgpr4_sgpr5, $exec, implicit-def dead $scc + ; REGALLOC-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR killed 
$sgpr6, 2, $vgpr0, implicit-def $sgpr6_sgpr7, implicit $sgpr6_sgpr7 + ; REGALLOC-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr7, 3, $vgpr0, implicit killed $sgpr6_sgpr7 ; REGALLOC-NEXT: SI_SPILL_WWM_V32_SAVE killed $vgpr0, %stack.2, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) - ; REGALLOC-NEXT: $exec = S_XOR_B64_term $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc - ; REGALLOC-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec - ; REGALLOC-NEXT: S_BRANCH %bb.2 + ; REGALLOC-NEXT: S_CMP_LG_U64_term renamable $sgpr4_sgpr5, 0, implicit-def $scc + ; REGALLOC-NEXT: $exec = S_CMOV_B64_term killed renamable $sgpr4_sgpr5, implicit $scc + ; REGALLOC-NEXT: S_CBRANCH_SCC1 %bb.2, implicit killed $scc + ; REGALLOC-NEXT: S_BRANCH %bb.4 ; REGALLOC-NEXT: {{ $}} ; REGALLOC-NEXT: bb.2.bb.1: ; REGALLOC-NEXT: successors: %bb.4(0x80000000) ; REGALLOC-NEXT: {{ $}} ; REGALLOC-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5) - ; REGALLOC-NEXT: renamable $sgpr4 = S_MOV_B32 10 - ; REGALLOC-NEXT: renamable $vgpr0 = V_ADD_U32_e64 $vgpr0, killed $sgpr4, 0, implicit $exec + ; REGALLOC-NEXT: $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; REGALLOC-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 2, implicit-def $sgpr4_sgpr5 + ; REGALLOC-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3 + ; REGALLOC-NEXT: renamable $sgpr6 = S_MOV_B32 10 + ; REGALLOC-NEXT: renamable $vgpr0 = V_ADD_U32_e64 $vgpr0, killed $sgpr6, 0, implicit $exec ; REGALLOC-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.6, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5) + ; REGALLOC-NEXT: $exec = S_OR_B64_term $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc ; REGALLOC-NEXT: S_BRANCH %bb.4 ; REGALLOC-NEXT: {{ $}} ; REGALLOC-NEXT: bb.3.bb.2: ; REGALLOC-NEXT: successors: %bb.1(0x80000000) ; REGALLOC-NEXT: {{ $}} ; 
REGALLOC-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5) - ; REGALLOC-NEXT: renamable $sgpr4 = S_MOV_B32 20 - ; REGALLOC-NEXT: renamable $vgpr0 = V_ADD_U32_e64 $vgpr0, killed $sgpr4, 0, implicit $exec + ; REGALLOC-NEXT: $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; REGALLOC-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0, implicit-def $sgpr4_sgpr5 + ; REGALLOC-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1 + ; REGALLOC-NEXT: renamable $sgpr6 = S_MOV_B32 20 + ; REGALLOC-NEXT: renamable $vgpr0 = V_ADD_U32_e64 $vgpr0, killed $sgpr6, 0, implicit $exec ; REGALLOC-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.3, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) + ; REGALLOC-NEXT: $exec = S_OR_B64_term $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc ; REGALLOC-NEXT: S_BRANCH %bb.1 ; REGALLOC-NEXT: {{ $}} ; REGALLOC-NEXT: bb.4.bb.3: ; REGALLOC-NEXT: $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) - ; REGALLOC-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 2, implicit-def $sgpr4_sgpr5 - ; REGALLOC-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3 - ; REGALLOC-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc ; REGALLOC-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5) ; REGALLOC-NEXT: renamable $vgpr0 = V_LSHL_ADD_U32_e64 killed $vgpr0, 2, $vgpr0, implicit $exec ; REGALLOC-NEXT: KILL killed renamable $vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/block-should-not-be-in-alive-blocks.mir b/llvm/test/CodeGen/AMDGPU/block-should-not-be-in-alive-blocks.mir index 6483ff28c0de05..9848e6b70696ad 100644 --- a/llvm/test/CodeGen/AMDGPU/block-should-not-be-in-alive-blocks.mir +++ b/llvm/test/CodeGen/AMDGPU/block-should-not-be-in-alive-blocks.mir @@ 
-22,12 +22,11 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 ; CHECK-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[COPY1]], implicit $exec ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed [[COPY1]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $exec_lo, implicit-def $exec_lo - ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY3]], killed [[V_CMP_NE_U32_e64_]], implicit-def dead $scc - ; CHECK-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[S_AND_B32_]], [[COPY3]], implicit-def dead $scc - ; CHECK-NEXT: $exec_lo = S_MOV_B32_term killed [[S_AND_B32_]] - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.5, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_NE_U32_e64_]], $exec_lo, implicit-def $scc + ; CHECK-NEXT: S_CMP_LG_U32_term [[V_CMP_NE_U32_e64_]], 0, implicit-def $scc + ; CHECK-NEXT: $exec_lo = S_CMOV_B32_term [[V_CMP_NE_U32_e64_]], implicit $scc + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; CHECK-NEXT: S_BRANCH %bb.5 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.7(0x80000000) @@ -39,6 +38,7 @@ body: | ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec ; CHECK-NEXT: GLOBAL_STORE_BYTE killed [[V_MOV_B]], killed [[GLOBAL_LOAD_UBYTE]], 0, 0, implicit $exec :: (store (s8), addrspace 1) + ; CHECK-NEXT: $exec_lo = S_OR_B32_term $exec_lo, %12, implicit-def $scc ; CHECK-NEXT: S_BRANCH %bb.7 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: @@ -61,12 +61,12 @@ body: | ; CHECK-NEXT: bb.5: ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.7(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[S_OR_SAVEEXEC_B32_:%[0-9]+]]:sreg_32 = S_OR_SAVEEXEC_B32 killed [[S_XOR_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK-NEXT: 
[[COPY4:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]] - ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 $exec_lo, [[S_OR_SAVEEXEC_B32_]], implicit-def $scc - ; CHECK-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_B32_1]], implicit-def $scc - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.7, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]] + ; CHECK-NEXT: [[S_XOR_B32_1:%[0-9]+]]:sreg_32 = S_XOR_B32 [[S_XOR_B32_]], $exec_lo, implicit-def $scc + ; CHECK-NEXT: S_CMP_LG_U32_term [[S_XOR_B32_]], 0, implicit-def $scc + ; CHECK-NEXT: $exec_lo = S_CMOV_B32_term [[S_XOR_B32_]], implicit $scc + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc + ; CHECK-NEXT: S_BRANCH %bb.7 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.6: ; CHECK-NEXT: successors: %bb.5(0x80000000) @@ -75,7 +75,6 @@ body: | ; CHECK-NEXT: S_BRANCH %bb.5 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.7: - ; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, killed [[S_AND_B32_1]], implicit-def $scc ; CHECK-NEXT: S_ENDPGM 0 bb.0: successors: %bb.2(0x40000000), %bb.5(0x40000000) @@ -97,6 +96,7 @@ body: | %10:vgpr_32 = GLOBAL_LOAD_UBYTE killed %9, 0, 0, implicit $exec :: (load (s8), addrspace 1) %11:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec GLOBAL_STORE_BYTE killed %11, killed %10, 0, 0, implicit $exec :: (store (s8), addrspace 1) + SI_WAVE_RECONVERGE %14, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.7 bb.2: @@ -128,7 +128,6 @@ body: | S_BRANCH %bb.5 bb.7: - SI_END_CF %14, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index 04d72691a088ab..88f5a5a3d5f283 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -28,13 +28,13 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr26_sgpr27 = S_XOR_B64 killed renamable $sgpr26_sgpr27, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $vgpr3 = V_MOV_B32_e32 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = DS_READ_B32_gfx9 renamable $vgpr3, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) null`, align 8, addrspace 3) - ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr24_sgpr25, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCZ %bb.2, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.1.bb103: - ; GFX90A-NEXT: successors: %bb.59(0x40000000), %bb.2(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr42_sgpr43, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000FF, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: successors: %bb.57(0x40000000), %bb.2(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr40_sgpr41, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000FF, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; 
GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr30_sgpr31 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr26_sgpr27, implicit-def dead $scc @@ -43,11 +43,11 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: $vgpr24 = IMPLICIT_DEF ; GFX90A-NEXT: $vgpr18 = IMPLICIT_DEF ; GFX90A-NEXT: $vgpr20 = IMPLICIT_DEF - ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.59, implicit $vcc + ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.57, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2: ; GFX90A-NEXT: successors: %bb.3(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr22, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6, $sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr30_sgpr31, $sgpr42_sgpr43, $sgpr54, $sgpr55, $sgpr16_sgpr17_sgpr18, $sgpr18_sgpr19, $sgpr20_sgpr21_sgpr22, $vgpr2, $vgpr3, $vgpr10, $vgpr24, $vgpr18, $vgpr20 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr22, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6, $sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr30_sgpr31, $sgpr40_sgpr41, $sgpr54, $sgpr55, $sgpr16_sgpr17_sgpr18, $sgpr18_sgpr19, $sgpr20_sgpr21_sgpr22, $vgpr2, $vgpr3, $vgpr10, $vgpr24, $vgpr18, $vgpr20 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr23 = IMPLICIT_DEF @@ -59,16 +59,16 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr28_sgpr29 = S_MOV_B64 0 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.3.Flow17: - ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.58(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr23, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr42_sgpr43, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000FF, 
$sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.56(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr23, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr40_sgpr41, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000FF, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr30 = V_AND_B32_e32 1023, $vgpr31, implicit $exec ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_VCCZ %bb.58, implicit $vcc + ; GFX90A-NEXT: S_CBRANCH_VCCZ %bb.56, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4.bb15: ; GFX90A-NEXT: successors: %bb.35(0x40000000), %bb.5(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000FF, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr42_sgpr43 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, 
$sgpr28_sgpr29, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000FF, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = V_LSHLREV_B64_e64 2, $vgpr2_vgpr3, implicit $exec ; GFX90A-NEXT: renamable $vgpr4 = COPY renamable $sgpr17, implicit $exec @@ -82,7 +82,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.5: ; GFX90A-NEXT: successors: %bb.6(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr42_sgpr43 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 0 @@ -92,7 +92,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 ; GFX90A-NEXT: renamable 
$sgpr38_sgpr39 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF @@ -129,72 +129,77 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr24 = COPY $sgpr15, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.7.Flow19: - ; GFX90A-NEXT: successors: %bb.63(0x40000000), %bb.8(0x40000000) + ; GFX90A-NEXT: successors: %bb.61(0x40000000), %bb.8(0x40000000) ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: renamable $sgpr30_sgpr31 = COPY $exec + ; GFX90A-NEXT: renamable $sgpr28_sgpr29 = S_AND_B64 killed renamable $sgpr28_sgpr29, $exec, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0 
- ; GFX90A-NEXT: $sgpr30_sgpr31 = S_AND_SAVEEXEC_B64 $sgpr28_sgpr29, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.63, implicit $exec + ; GFX90A-NEXT: $exec = S_CMOV_B64 killed renamable $sgpr28_sgpr29, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC1 %bb.61, implicit $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.8.Flow32: ; GFX90A-NEXT: successors: %bb.9(0x40000000), %bb.10(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; 
GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def $scc - ; GFX90A-NEXT: $sgpr8_sgpr9 = S_AND_SAVEEXEC_B64 $sgpr42_sgpr43, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: renamable $sgpr8_sgpr9 = S_XOR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.10, implicit $exec + ; GFX90A-NEXT: renamable $sgpr8_sgpr9 = COPY $exec + ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr40_sgpr41, $exec, implicit-def $scc + ; GFX90A-NEXT: $exec = S_CMOV_B64 killed renamable $sgpr18_sgpr19, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC0 %bb.10, implicit $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.9.bb89: ; GFX90A-NEXT: successors: %bb.10(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, 
$vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.10.Flow33: ; GFX90A-NEXT: successors: %bb.11(0x40000000), %bb.12(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, 
$sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def $scc - ; GFX90A-NEXT: $sgpr8_sgpr9 = S_AND_SAVEEXEC_B64 $sgpr56_sgpr57, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: renamable $sgpr8_sgpr9 = S_XOR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.12, implicit $exec + ; GFX90A-NEXT: renamable $sgpr8_sgpr9 = COPY $exec + ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def $scc + ; GFX90A-NEXT: $exec = S_CMOV_B64 killed renamable $sgpr18_sgpr19, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC0 %bb.12, implicit $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.11.bb84: ; GFX90A-NEXT: successors: %bb.12(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, 
$vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.12.Flow34: ; GFX90A-NEXT: successors: %bb.13(0x40000000), %bb.14(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, 
$vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def $scc - ; GFX90A-NEXT: $sgpr8_sgpr9 = S_AND_SAVEEXEC_B64 $sgpr52_sgpr53, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: renamable $sgpr8_sgpr9 = S_XOR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.14, implicit $exec + ; GFX90A-NEXT: renamable $sgpr8_sgpr9 = COPY $exec + ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def $scc + ; GFX90A-NEXT: $exec = S_CMOV_B64 killed renamable $sgpr18_sgpr19, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC0 %bb.14, implicit $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.13.bb79: ; GFX90A-NEXT: successors: %bb.14(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, 
$vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.14.Flow35: ; GFX90A-NEXT: successors: %bb.15(0x40000000), %bb.16(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, 
$sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def $scc - ; GFX90A-NEXT: $sgpr8_sgpr9 = S_AND_SAVEEXEC_B64 $sgpr16_sgpr17, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_XOR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.16, implicit $exec + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = COPY $exec + ; GFX90A-NEXT: renamable $sgpr8_sgpr9 = S_AND_B64 killed renamable $sgpr16_sgpr17, $exec, implicit-def $scc + ; GFX90A-NEXT: $exec = S_CMOV_B64 killed renamable $sgpr8_sgpr9, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC0 %bb.16, implicit $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.15.bb72: ; GFX90A-NEXT: successors: %bb.16(0x80000000) @@ -206,83 +211,88 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr6_sgpr7 = S_LOAD_DWORDX2_IMM killed 
renamable $sgpr6_sgpr7, 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) ; GFX90A-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr6_sgpr7, @f2, csr_amdgpu_gfx90ainsts, implicit $sgpr4_sgpr5, implicit undef $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit undef $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1 ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr40_sgpr41, implicit-def $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.16.Flow36: ; GFX90A-NEXT: successors: %bb.17(0x40000000), %bb.18(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr42_sgpr43, implicit-def $scc - ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr50_sgpr51, implicit-def $exec, implicit-def $scc, implicit $exec - ; 
GFX90A-NEXT: renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.18, implicit $exec + ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = COPY $exec + ; GFX90A-NEXT: renamable $sgpr6_sgpr7 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def $scc + ; GFX90A-NEXT: $exec = S_CMOV_B64 killed renamable $sgpr6_sgpr7, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC0 %bb.18, implicit $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.17.bb67: ; GFX90A-NEXT: successors: %bb.18(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr47, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr46, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, 
implicit-def dead $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.18.Flow37: ; GFX90A-NEXT: successors: %bb.19(0x40000000), %bb.20(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc - ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr48_sgpr49, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.20, implicit $exec + ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = COPY $exec + ; GFX90A-NEXT: renamable $sgpr6_sgpr7 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def $scc + ; GFX90A-NEXT: $exec = S_CMOV_B64 killed renamable $sgpr6_sgpr7, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC0 %bb.20, implicit $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.19.bb62: ; GFX90A-NEXT: successors: %bb.20(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, 
$sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr63, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr62, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.20.Flow38: ; GFX90A-NEXT: successors: %bb.21(0x40000000), %bb.22(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr54_sgpr55, 
$vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc - ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr46_sgpr47, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.22, implicit $exec + ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = COPY $exec + ; GFX90A-NEXT: renamable $sgpr6_sgpr7 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def $scc + ; GFX90A-NEXT: $exec = S_CMOV_B64 killed renamable $sgpr6_sgpr7, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC0 %bb.22, implicit $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.21.bb54: ; GFX90A-NEXT: successors: %bb.22(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr61, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: 
BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr60, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.22.Flow39: ; GFX90A-NEXT: successors: %bb.23(0x40000000), %bb.24(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc - ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr44_sgpr45, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.24, implicit $exec + ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = COPY $exec + ; GFX90A-NEXT: renamable $sgpr6_sgpr7 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def $scc + ; GFX90A-NEXT: $exec = S_CMOV_B64 killed renamable $sgpr6_sgpr7, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC0 %bb.24, implicit $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.23.bb47: ; GFX90A-NEXT: successors: %bb.24(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, 
$sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr59, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr58, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.24.Flow40: ; GFX90A-NEXT: successors: %bb.25(0x40000000), %bb.26(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc - ; 
GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr40_sgpr41, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.26, implicit $exec + ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = COPY $exec + ; GFX90A-NEXT: renamable $sgpr6_sgpr7 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def $scc + ; GFX90A-NEXT: $exec = S_CMOV_B64 killed renamable $sgpr6_sgpr7, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC0 %bb.26, implicit $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.25.bb40: ; GFX90A-NEXT: successors: %bb.26(0x80000000) @@ -291,15 +301,16 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr57, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr56, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.26.Flow41: ; GFX90A-NEXT: successors: %bb.27(0x40000000), %bb.28(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, 
killed renamable $sgpr4_sgpr5, implicit-def $scc - ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr38_sgpr39, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.28, implicit $exec + ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = COPY $exec + ; GFX90A-NEXT: renamable $sgpr6_sgpr7 = S_AND_B64 killed renamable $sgpr38_sgpr39, $exec, implicit-def $scc + ; GFX90A-NEXT: $exec = S_CMOV_B64 killed renamable $sgpr6_sgpr7, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC0 %bb.28, implicit $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.27.bb33: ; GFX90A-NEXT: successors: %bb.28(0x80000000) @@ -308,21 +319,21 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr45, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr44, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.28.Flow42: ; GFX90A-NEXT: successors: %bb.34(0x40000000), %bb.29(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc - 
; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr36_sgpr37, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.34, implicit $exec + ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = COPY $exec + ; GFX90A-NEXT: renamable $sgpr6_sgpr7 = S_AND_B64 killed renamable $sgpr36_sgpr37, $exec, implicit-def $scc + ; GFX90A-NEXT: $exec = S_CMOV_B64 killed renamable $sgpr6_sgpr7, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC1 %bb.34, implicit $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.29.Flow43: ; GFX90A-NEXT: successors: %bb.30(0x40000000), %bb.31(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr34_sgpr35, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr34_sgpr35, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.31, implicit $vcc ; GFX90A-NEXT: {{ $}} @@ -338,8 +349,9 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: successors: %bb.32(0x40000000), %bb.33(0x40000000) ; GFX90A-NEXT: liveins: $sgpr54_sgpr55, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr54_sgpr55, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.33, implicit $exec + ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def $scc + ; GFX90A-NEXT: $exec = S_CMOV_B64 killed renamable $sgpr4_sgpr5, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC0 %bb.33, implicit $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.32.UnifiedUnreachableBlock: ; GFX90A-NEXT: 
successors: %bb.33(0x80000000) @@ -359,12 +371,14 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr43, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr42, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GFX90A-NEXT: S_BRANCH %bb.29 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.35.bb20: - ; GFX90A-NEXT: successors: %bb.37(0x40000000), %bb.36(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr42_sgpr43 + ; GFX90A-NEXT: successors: %bb.36(0x40000000), %bb.6(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41 ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: renamable $sgpr30_sgpr31 = COPY $exec ; GFX90A-NEXT: renamable 
$vgpr0 = GLOBAL_LOAD_SBYTE renamable $vgpr40_vgpr41, 1024, 0, implicit $exec :: (load (s8) from %ir.i21, addrspace 1) ; GFX90A-NEXT: renamable $vgpr42 = V_ADD_CO_U32_e32 1024, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 0 @@ -376,9 +390,9 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr43, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_LT_I16_e64 0, killed $vgpr0, implicit $exec + ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF @@ -398,20 +412,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: $sgpr30_sgpr31 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.37, implicit $exec - ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.36.Flow21: - ; GFX90A-NEXT: successors: %bb.6(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, 
$sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: S_CMP_LG_U64 renamable $vcc, 0, implicit-def $scc + ; GFX90A-NEXT: $exec = S_CMOV_B64 killed renamable $vcc, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC0 %bb.6, implicit $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def $scc - ; GFX90A-NEXT: S_BRANCH %bb.6 - ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.37.bb27: - ; GFX90A-NEXT: successors: %bb.39(0x40000000), %bb.38(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr42_sgpr43, $sgpr56_sgpr57, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45, $sgpr40_sgpr41 + ; GFX90A-NEXT: bb.36.bb27: + ; GFX90A-NEXT: successors: %bb.38(0x40000000), %bb.37(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, 
$vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41, $sgpr56_sgpr57, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr42_sgpr43 ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = COPY $exec ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr40_vgpr41, 2048, 0, implicit $exec :: (load (s8) from %ir.i28, addrspace 1) ; GFX90A-NEXT: renamable $vgpr44 = V_ADD_CO_U32_e32 2048, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_MOV_B64 -1 @@ -437,38 +446,41 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: $sgpr36_sgpr37 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.39, implicit $exec + ; GFX90A-NEXT: S_CMP_LG_U64 renamable $vcc, 0, implicit-def $scc + ; GFX90A-NEXT: $exec = S_CMOV_B64 killed renamable $vcc, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC1 %bb.38, implicit $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.38.Flow22: - ; GFX90A-NEXT: successors: %bb.36(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr42_sgpr43, $sgpr44_sgpr45, 
$sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: bb.37.Flow22: + ; GFX90A-NEXT: successors: %bb.6(0x80000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, 
$vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr36_sgpr37, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_XOR_B64 $exec, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_AND_B64 killed renamable $sgpr16_sgpr17, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_AND_B64 killed renamable $sgpr40_sgpr41, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr28_sgpr29 = S_ANDN2_B64 killed renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable 
$sgpr58_sgpr59, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr28_sgpr29 = S_OR_B64 killed renamable $sgpr28_sgpr29, killed renamable $sgpr54_sgpr55, implicit-def dead $scc - ; GFX90A-NEXT: S_BRANCH %bb.36 + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def $scc + ; GFX90A-NEXT: S_BRANCH %bb.6 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.39.bb34: - ; GFX90A-NEXT: successors: %bb.41(0x40000000), %bb.40(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41, $sgpr56_sgpr57, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45 + ; GFX90A-NEXT: bb.38.bb34: + ; GFX90A-NEXT: successors: %bb.40(0x40000000), %bb.39(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41, $sgpr56_sgpr57, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr60_sgpr61, $sgpr62_sgpr63 ; GFX90A-NEXT: {{ $}} + ; 
GFX90A-NEXT: renamable $sgpr38_sgpr39 = COPY $exec ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr40_vgpr41, 3072, 0, implicit $exec :: (load (s8) from %ir.i35, addrspace 1) ; GFX90A-NEXT: renamable $vgpr56 = V_ADD_CO_U32_e32 3072, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = COPY renamable $sgpr28_sgpr29 + ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = COPY renamable $sgpr28_sgpr29 + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr57, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF @@ -487,40 +499,42 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: $sgpr38_sgpr39 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.41, implicit $exec + ; GFX90A-NEXT: S_CMP_LG_U64 renamable $vcc, 0, implicit-def $scc + ; GFX90A-NEXT: $exec = S_CMOV_B64 killed renamable $vcc, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC1 %bb.40, implicit $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.40.Flow23: - ; GFX90A-NEXT: successors: %bb.38(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, 
$sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: bb.39.Flow23: + ; GFX90A-NEXT: successors: %bb.37(0x80000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 
$exec, killed renamable $sgpr38_sgpr39, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr40_sgpr41, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_AND_B64 killed renamable $sgpr40_sgpr41, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_ANDN2_B64 renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead 
$scc - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr38_sgpr39, killed renamable $sgpr40_sgpr41, implicit-def dead $scc - ; GFX90A-NEXT: S_BRANCH %bb.38 + ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr38_sgpr39, killed renamable $sgpr44_sgpr45, implicit-def dead $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr36_sgpr37, implicit-def $scc + ; GFX90A-NEXT: S_BRANCH %bb.37 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.41.bb41: - ; GFX90A-NEXT: successors: %bb.47(0x40000000), %bb.42(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr56_sgpr57, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47 + ; GFX90A-NEXT: bb.40.bb41: + ; GFX90A-NEXT: successors: %bb.46(0x40000000), %bb.41(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, 
$vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr56_sgpr57, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr60_sgpr61 ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = COPY $exec ; GFX90A-NEXT: renamable $vgpr58 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = COPY $vcc ; GFX90A-NEXT: renamable $vgpr59, dead renamable $sgpr16_sgpr17 = V_ADDC_U32_e64 0, $vgpr41, killed $sgpr16_sgpr17, 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr58_vgpr59, 0, 0, implicit $exec :: (load (s8) from %ir.i42, addrspace 1) ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = COPY renamable $sgpr28_sgpr29 + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = COPY renamable $sgpr28_sgpr29 ; GFX90A-NEXT: renamable $vgpr18, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF @@ -538,44 +552,45 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: $sgpr40_sgpr41 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.47, implicit $exec + ; GFX90A-NEXT: S_CMP_LG_U64 renamable $vcc, 0, implicit-def $scc + ; GFX90A-NEXT: $exec = S_CMOV_B64 killed renamable $vcc, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC1 %bb.46, implicit $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.42.Flow24: - ; GFX90A-NEXT: successors: %bb.40(0x80000000) - ; GFX90A-NEXT: 
liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: bb.41.Flow24: + ; GFX90A-NEXT: successors: %bb.39(0x80000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, 
$vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr40_sgpr41, implicit-def $scc ; GFX90A-NEXT: renamable $vgpr59 = COPY killed renamable $vgpr18, implicit $exec ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_AND_B64 killed renamable $sgpr16_sgpr17, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable 
$sgpr16_sgpr17 = S_ANDN2_B64 renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr16_sgpr17, killed renamable $sgpr54_sgpr55, implicit-def dead $scc - ; GFX90A-NEXT: S_BRANCH %bb.40 + ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_OR_B64 killed renamable $sgpr16_sgpr17, killed renamable $sgpr44_sgpr45, implicit-def dead $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr38_sgpr39, implicit-def $scc + ; GFX90A-NEXT: S_BRANCH %bb.39 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.43.bb55: - ; GFX90A-NEXT: successors: %bb.49(0x40000000), %bb.44(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr52_sgpr53, $sgpr44_sgpr45 + ; GFX90A-NEXT: bb.42.bb55: + ; GFX90A-NEXT: successors: %bb.48(0x40000000), %bb.43(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, 
$sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr52_sgpr53, $sgpr44_sgpr45 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: S_BITCMP1_B32 killed renamable $sgpr33, 16, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_CSELECT_B64 -1, 0, implicit killed $scc - ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_XOR_B64 renamable $sgpr62_sgpr63, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $vgpr62 = V_ADD_CO_U32_e32 6144, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $vgpr63, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr46_sgpr47, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.49, implicit $vcc + ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.48, implicit $vcc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.44: - ; GFX90A-NEXT: successors: %bb.45(0x80000000) + ; GFX90A-NEXT: bb.43: + ; GFX90A-NEXT: successors: %bb.44(0x80000000) ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, 
$sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr52_sgpr53 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = COPY renamable $sgpr28_sgpr29 @@ -594,43 +609,46 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.45.Flow26: - ; GFX90A-NEXT: successors: %bb.46(0x80000000) + ; GFX90A-NEXT: bb.44.Flow26: + ; GFX90A-NEXT: successors: %bb.45(0x80000000) ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6, $sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18, $sgpr18_sgpr19, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr16, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr60, $vgpr61, $vgpr62, $vgpr63 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.46.Flow26: - ; GFX90A-NEXT: successors: %bb.48(0x80000000) + ; GFX90A-NEXT: bb.45.Flow26: + ; GFX90A-NEXT: successors: %bb.47(0x80000000) ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, 
$vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr16_sgpr17, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, 
implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_ANDN2_B64 renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_OR_B64 killed renamable $sgpr44_sgpr45, killed renamable $sgpr46_sgpr47, implicit-def dead $scc - ; GFX90A-NEXT: S_BRANCH %bb.48 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_OR_B64 killed renamable $sgpr44_sgpr45, killed renamable $sgpr48_sgpr49, implicit-def dead $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr42_sgpr43, implicit-def $scc + ; GFX90A-NEXT: S_BRANCH %bb.47 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.47.bb48: - ; GFX90A-NEXT: successors: %bb.43(0x40000000), %bb.48(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr56_sgpr57, 
$sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr50_sgpr51, $sgpr44_sgpr45 + ; GFX90A-NEXT: bb.46.bb48: + ; GFX90A-NEXT: successors: %bb.42(0x40000000), %bb.47(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr56_sgpr57, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr44_sgpr45 ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = COPY $exec ; GFX90A-NEXT: renamable $vgpr60 = V_ADD_CO_U32_e32 5120, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = COPY $vcc ; GFX90A-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $vgpr1, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE killed renamable $vgpr0_vgpr1, 1024, 0, implicit $exec :: (load (s8) from %ir.i49, addrspace 1) - ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = COPY renamable $sgpr28_sgpr29 ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr61, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $sgpr16_sgpr17, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, 
implicit $exec + ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF @@ -646,36 +664,37 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: $sgpr16_sgpr17 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.43, implicit $exec + ; GFX90A-NEXT: S_CMP_LG_U64 renamable $vcc, 0, implicit-def $scc + ; GFX90A-NEXT: $exec = S_CMOV_B64 killed renamable $vcc, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC1 %bb.42, implicit $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.48.Flow25: - ; GFX90A-NEXT: successors: %bb.42(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, 
$vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: bb.47.Flow25: + ; GFX90A-NEXT: successors: %bb.41(0x80000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr46_sgpr47, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr16_sgpr17, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable 
$sgpr64_sgpr65, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr16_sgpr17, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_ANDN2_B64 renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr42_sgpr43, killed renamable $sgpr54_sgpr55, implicit-def dead $scc - ; GFX90A-NEXT: S_BRANCH %bb.42 + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_OR_B64 killed renamable $sgpr42_sgpr43, killed renamable $sgpr46_sgpr47, implicit-def dead $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr40_sgpr41, implicit-def $scc + ; GFX90A-NEXT: S_BRANCH %bb.41 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.49.bb63: - ; GFX90A-NEXT: successors: %bb.51(0x40000000), %bb.50(0x40000000) - ; GFX90A-NEXT: liveins: $vcc, $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, 
$sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr46_sgpr47, $sgpr54_sgpr55:0x000000000000000F, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr52_sgpr53 + ; GFX90A-NEXT: bb.48.bb63: + ; GFX90A-NEXT: successors: %bb.50(0x40000000), %bb.49(0x40000000) + ; GFX90A-NEXT: liveins: $vcc, $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr54_sgpr55:0x000000000000000F, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr52_sgpr53 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 - ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.51, implicit $vcc + ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_MOV_B64 0 + ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.50, implicit $vcc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.50: - ; GFX90A-NEXT: successors: %bb.45(0x80000000) + ; GFX90A-NEXT: bb.49: + ; GFX90A-NEXT: 
successors: %bb.44(0x80000000) ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr52_sgpr53 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = COPY renamable $sgpr28_sgpr29 @@ -692,20 +711,20 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: S_BRANCH %bb.45 + ; GFX90A-NEXT: S_BRANCH %bb.44 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.51.bb68: - ; GFX90A-NEXT: successors: %bb.55(0x40000000), %bb.52(0x40000000) + ; GFX90A-NEXT: bb.50.bb68: + ; GFX90A-NEXT: successors: %bb.52(0x40000000), %bb.51(0x40000000) ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr54_sgpr55:0x000000000000000F, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, 
$vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr52_sgpr53 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = nuw nsw V_LSHLREV_B32_e32 3, $vgpr30, implicit $exec ; GFX90A-NEXT: renamable $vgpr1 = V_MOV_B32_e32 0, implicit $exec ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr46_sgpr47, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.55, implicit $vcc + ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.52, implicit $vcc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.52: - ; GFX90A-NEXT: successors: %bb.46(0x80000000) + ; GFX90A-NEXT: bb.51: + ; GFX90A-NEXT: successors: %bb.45(0x80000000) ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr52_sgpr53 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 -1 @@ -722,20 +741,48 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; 
GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: S_BRANCH %bb.46 + ; GFX90A-NEXT: S_BRANCH %bb.45 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: bb.52.bb73: + ; GFX90A-NEXT: successors: %bb.53(0x40000000), %bb.45(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr54_sgpr55:0x000000000000000F, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr56_sgpr57 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = COPY $exec + ; GFX90A-NEXT: renamable $vgpr6 = GLOBAL_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec :: (load (s8) from %ir.i74, addrspace 1) + ; GFX90A-NEXT: renamable $vgpr4 = V_ADD_CO_U32_e32 2048, $vgpr0, implicit-def $vcc, implicit $exec + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = COPY renamable $sgpr28_sgpr29 + ; GFX90A-NEXT: renamable $vgpr5, dead renamable $sgpr50_sgpr51 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec + ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr6, implicit $exec + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable 
$vgpr8_vgpr9 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF + ; GFX90A-NEXT: S_CMP_LG_U64 renamable $vcc, 0, implicit-def $scc + ; GFX90A-NEXT: $exec = S_CMOV_B64 killed renamable $vcc, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC0 %bb.45, implicit $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.53.bb80: - ; GFX90A-NEXT: successors: %bb.60(0x40000000), %bb.54(0x40000000) + ; GFX90A-NEXT: successors: %bb.58(0x40000000), %bb.54(0x40000000) ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr54_sgpr55:0x000000000000000F, $sgpr58_sgpr59, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr15 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def dead $scc ; GFX90A-NEXT: S_CMP_EQ_U32 killed renamable $sgpr15, 
0, implicit-def $scc ; GFX90A-NEXT: renamable $vgpr6 = V_ADD_CO_U32_e32 4096, $vgpr0, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $vgpr7, dead renamable $sgpr48_sgpr49 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec - ; GFX90A-NEXT: S_CBRANCH_SCC1 %bb.60, implicit killed $scc + ; GFX90A-NEXT: S_CBRANCH_SCC1 %bb.58, implicit killed $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.54: - ; GFX90A-NEXT: successors: %bb.62(0x80000000) + ; GFX90A-NEXT: successors: %bb.60(0x80000000) ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr58_sgpr59, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 @@ -751,43 +798,10 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: S_BRANCH %bb.62 - ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.55.bb73: - ; GFX90A-NEXT: successors: %bb.53(0x40000000), %bb.56(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, 
$sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr54_sgpr55:0x000000000000000F, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50_sgpr51 + ; GFX90A-NEXT: S_BRANCH %bb.60 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr6 = GLOBAL_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec :: (load (s8) from %ir.i74, addrspace 1) - ; GFX90A-NEXT: renamable $vgpr4 = V_ADD_CO_U32_e32 2048, $vgpr0, implicit-def $vcc, implicit $exec - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = COPY renamable $sgpr28_sgpr29 - ; GFX90A-NEXT: renamable $vgpr5, dead renamable $sgpr56_sgpr57 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec - ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr6, implicit $exec - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF - ; GFX90A-NEXT: 
renamable $vgpr11 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: $sgpr58_sgpr59 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.53, implicit $exec - ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.56.Flow29: - ; GFX90A-NEXT: successors: %bb.46(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr58_sgpr59, implicit-def $scc - ; GFX90A-NEXT: S_BRANCH %bb.46 - ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.57.bb90: - ; GFX90A-NEXT: successors: %bb.61(0x80000000) + ; GFX90A-NEXT: bb.55.bb90: + ; GFX90A-NEXT: successors: %bb.59(0x80000000) ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, 
$sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr50_sgpr51, $sgpr54_sgpr55:0x000000000000000F, $sgpr58_sgpr59, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr53 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr62_sgpr63, implicit $exec @@ -805,11 +819,12 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_XOR_B64 $exec, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_OR_B64 renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $vgpr14, implicit $exec - ; GFX90A-NEXT: S_BRANCH %bb.61 + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr50_sgpr51, implicit-def $scc + ; GFX90A-NEXT: S_BRANCH %bb.59 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.58: + ; GFX90A-NEXT: bb.56: ; GFX90A-NEXT: successors: %bb.7(0x80000000) - ; GFX90A-NEXT: liveins: $exec, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr23, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr42_sgpr43, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, 
$vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $exec, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr23, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr40_sgpr41, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr15 = COPY killed renamable $sgpr23, implicit $exec ; GFX90A-NEXT: renamable $vgpr17 = COPY killed renamable $sgpr15, implicit $exec @@ -820,7 +835,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF @@ -844,9 +859,9 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 0 ; GFX90A-NEXT: S_BRANCH %bb.7 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.59.bb105: + ; GFX90A-NEXT: bb.57.bb105: ; GFX90A-NEXT: successors: %bb.3(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, 
$sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr30_sgpr31, $sgpr42_sgpr43, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000FF, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr30_sgpr31, $sgpr40_sgpr41, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000FF, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr22_vgpr23 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) @@ -863,16 +878,17 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr15 = S_MOV_B32 0 ; GFX90A-NEXT: S_BRANCH %bb.3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.60.bb85: - ; GFX90A-NEXT: successors: %bb.57(0x40000000), %bb.61(0x40000000) + ; GFX90A-NEXT: bb.58.bb85: + ; GFX90A-NEXT: successors: %bb.55(0x40000000), %bb.59(0x40000000) ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr54_sgpr55:0x000000000000000F, $sgpr58_sgpr59, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, 
$vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = COPY $exec ; GFX90A-NEXT: renamable $vgpr8 = V_OR_B32_e32 1, $vgpr6, implicit $exec ; GFX90A-NEXT: renamable $vgpr9 = COPY renamable $vgpr7, implicit $exec ; GFX90A-NEXT: renamable $vgpr10 = FLAT_LOAD_UBYTE renamable $vgpr8_vgpr9, 0, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i86) ; GFX90A-NEXT: renamable $sgpr15 = S_MOV_B32 0 - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr10, implicit $exec + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = COPY renamable $sgpr28_sgpr29 ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF @@ -882,18 +898,18 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF - ; GFX90A-NEXT: $sgpr50_sgpr51 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.57, implicit $exec + ; GFX90A-NEXT: S_CMP_LG_U64 renamable $vcc, 0, implicit-def $scc + ; GFX90A-NEXT: $exec = S_CMOV_B64 killed renamable $vcc, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC1 %bb.55, implicit $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.61.Flow31: - ; GFX90A-NEXT: successors: %bb.62(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, 
$sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: bb.59.Flow31: + ; GFX90A-NEXT: successors: %bb.60(0x80000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, 
$vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr50_sgpr51, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.62.Flow30: - ; GFX90A-NEXT: successors: %bb.56(0x80000000) + ; GFX90A-NEXT: bb.60.Flow30: + ; GFX90A-NEXT: successors: %bb.45(0x80000000) ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_XOR_B64 $exec, -1, implicit-def dead $scc @@ -902,48 +918,59 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_ANDN2_B64 renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc ; 
GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_OR_B64 killed renamable $sgpr48_sgpr49, killed renamable $sgpr54_sgpr55, implicit-def dead $scc - ; GFX90A-NEXT: S_BRANCH %bb.56 + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr58_sgpr59, implicit-def $scc + ; GFX90A-NEXT: S_BRANCH %bb.45 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.63.bb140: - ; GFX90A-NEXT: successors: %bb.69(0x40000000), %bb.64(0x40000000) + ; GFX90A-NEXT: bb.61.bb140: + ; GFX90A-NEXT: successors: %bb.68(0x40000000), %bb.62(0x40000000) ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr28_sgpr29 = S_MOV_B64 -1 ; GFX90A-NEXT: 
renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr26_sgpr27, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.69, implicit $vcc + ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.68, implicit $vcc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.64.Flow13: - ; GFX90A-NEXT: successors: %bb.65(0x40000000), %bb.67(0x40000000) + ; GFX90A-NEXT: bb.62.Flow13: + ; GFX90A-NEXT: successors: %bb.63(0x40000000), %bb.66(0x40000000) ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr28_sgpr29, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.67, implicit $vcc + ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.66, implicit $vcc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.65.bb159: - ; GFX90A-NEXT: successors: %bb.68(0x40000000), %bb.66(0x40000000) + ; GFX90A-NEXT: bb.63.bb159: + ; GFX90A-NEXT: successors: %bb.67(0x40000000), %bb.64(0x40000000) ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, 
$vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vcc = V_CMP_NE_U32_e64 0, killed $vgpr30, implicit $exec - ; GFX90A-NEXT: $sgpr8_sgpr9 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: renamable $sgpr8_sgpr9 = S_XOR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.68, implicit $exec + ; GFX90A-NEXT: renamable $sgpr8_sgpr9 = S_XOR_B64 renamable $vcc, $exec, implicit-def $scc + ; GFX90A-NEXT: S_CMP_LG_U64 renamable $vcc, 0, implicit-def $scc + ; GFX90A-NEXT: $exec = S_CMOV_B64 killed renamable $vcc, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC1 %bb.67, implicit $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.66.Flow10: - ; GFX90A-NEXT: successors: %bb.67(0x80000000) + ; GFX90A-NEXT: bb.64.Flow10: + ; GFX90A-NEXT: successors: %bb.65(0x40000000), %bb.66(0x40000000) ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, 
$sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $sgpr8_sgpr9 = S_ANDN2_SAVEEXEC_B64 $sgpr8_sgpr9, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def $scc + ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_XOR_B64 renamable $sgpr8_sgpr9, $exec, implicit-def $scc + ; GFX90A-NEXT: S_CMP_LG_U64 renamable $sgpr8_sgpr9, 0, implicit-def $scc + ; GFX90A-NEXT: $exec = S_CMOV_B64 killed renamable $sgpr8_sgpr9, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC0 %bb.66, implicit $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.67.Flow14: + ; GFX90A-NEXT: bb.65.bb160: + ; GFX90A-NEXT: successors: %bb.66(0x80000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr18_sgpr19, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, 
$vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr18_sgpr19, implicit-def $scc + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: bb.66.Flow14: ; GFX90A-NEXT: successors: %bb.8(0x80000000) ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = COPY $exec + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def $scc ; GFX90A-NEXT: S_BRANCH %bb.8 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.68.bb161: - ; GFX90A-NEXT: successors: %bb.66(0x80000000) + ; GFX90A-NEXT: bb.67.bb161: + ; GFX90A-NEXT: successors: %bb.64(0x80000000) ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, 
$vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr21, killed $vgpr23, implicit $exec @@ -959,10 +986,11 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr15, implicit $exec ; GFX90A-NEXT: DS_WRITE2_B32_gfx9 killed renamable $vgpr3, killed renamable $vgpr2, renamable $vgpr3, 0, 1, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 4, addrspace 3) - ; GFX90A-NEXT: S_BRANCH %bb.66 + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, renamable $sgpr8_sgpr9, implicit-def $scc + ; GFX90A-NEXT: S_BRANCH %bb.64 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.69.bb174: - ; GFX90A-NEXT: successors: %bb.73(0x40000000), %bb.70(0x40000000) + ; GFX90A-NEXT: bb.68.bb174: + ; GFX90A-NEXT: successors: %bb.72(0x40000000), %bb.69(0x40000000) ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, 
$vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr26 = V_OR_B32_e32 1, $vgpr24, implicit $exec @@ -975,17 +1003,17 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr50 = V_CNDMASK_B32_e64 0, 0, 0, $vgpr32, killed $sgpr8_sgpr9, implicit $exec ; GFX90A-NEXT: renamable $sgpr8_sgpr9 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr24_sgpr25, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.73, implicit $vcc + ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.72, implicit $vcc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.70.Flow: - ; GFX90A-NEXT: successors: %bb.71(0x40000000), %bb.72(0x40000000) + ; GFX90A-NEXT: bb.69.Flow: + ; GFX90A-NEXT: successors: %bb.70(0x40000000), %bb.71(0x40000000) ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, 
$vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.72, implicit $vcc + ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.71, implicit $vcc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.71.bb186: - ; GFX90A-NEXT: successors: %bb.72(0x80000000) + ; GFX90A-NEXT: bb.70.bb186: + ; GFX90A-NEXT: successors: %bb.71(0x80000000) ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, 
$vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr2_vgpr3 = V_LSHLREV_B64_e64 3, killed $vgpr2_vgpr3, implicit $exec @@ -1013,15 +1041,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.72.Flow9: - ; GFX90A-NEXT: successors: %bb.64(0x80000000) + ; GFX90A-NEXT: bb.71.Flow9: + ; GFX90A-NEXT: successors: %bb.62(0x80000000) ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, 
$vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr28_sgpr29 = S_MOV_B64 0 - ; GFX90A-NEXT: S_BRANCH %bb.64 + ; GFX90A-NEXT: S_BRANCH %bb.62 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.73.bb196: - ; GFX90A-NEXT: successors: %bb.70(0x80000000) + ; GFX90A-NEXT: bb.72.bb196: + ; GFX90A-NEXT: successors: %bb.69(0x80000000) ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, 
$vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr10 = V_OR_B32_e32 $vgpr50, killed $vgpr16, implicit $exec @@ -1029,7 +1057,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr55 = V_MOV_B32_e32 0, implicit $exec ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr55, renamable $vgpr54_vgpr55, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: renamable $sgpr8_sgpr9 = S_MOV_B64 0 - ; GFX90A-NEXT: S_BRANCH %bb.70 + ; GFX90A-NEXT: S_BRANCH %bb.69 bb: %i = tail call i32 @llvm.amdgcn.workitem.id.x() %i11 = icmp eq i32 %i, 0 diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll index 635f3e4886b875..a974c0d39004a2 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll @@ -167,17 +167,19 @@ define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 { ; GCN-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: v_add_i32_e64 v0, s[0:1], s0, v0 +; GCN-NEXT: s_mov_b64 s[4:5], exec +; GCN-NEXT: v_addc_u32_e64 v1, s[0:1], 0, v1, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GCN-NEXT: s_cbranch_execnz .LBB3_1 +; GCN-NEXT: s_and_b64 s[6:7], vcc, -1 +; 
GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc1 .LBB3_1 ; GCN-NEXT: ; %bb.3: ; %bb -; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_getpc_b64 s[0:1] ; GCN-NEXT: .Lpost_getpc2: -; GCN-NEXT: s_add_u32 s4, s4, (.LBB3_2-.Lpost_getpc2)&4294967295 -; GCN-NEXT: s_addc_u32 s5, s5, (.LBB3_2-.Lpost_getpc2)>>32 -; GCN-NEXT: s_setpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s0, s0, (.LBB3_2-.Lpost_getpc2)&4294967295 +; GCN-NEXT: s_addc_u32 s1, s1, (.LBB3_2-.Lpost_getpc2)>>32 +; GCN-NEXT: s_setpc_b64 s[0:1] ; GCN-NEXT: .LBB3_1: ; %bb2 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; 32 bytes @@ -186,8 +188,8 @@ define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 { ; GCN-NEXT: v_nop_e64 ; GCN-NEXT: v_nop_e64 ; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: .LBB3_2: ; %bb3 -; GCN-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-NEXT: s_mov_b32 s0, s2 ; GCN-NEXT: s_mov_b32 s1, s2 ; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 @@ -444,13 +446,15 @@ define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) %out, i32 % ; GCN-LABEL: uniform_inside_divergent: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execnz .LBB8_1 -; GCN-NEXT: ; %bb.4: ; %entry +; GCN-NEXT: s_mov_b64 s[4:5], exec +; GCN-NEXT: s_and_b64 s[0:1], vcc, -1 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc1 .LBB8_1 +; GCN-NEXT: ; %bb.5: ; %entry ; GCN-NEXT: s_getpc_b64 s[0:1] ; GCN-NEXT: .Lpost_getpc9: -; GCN-NEXT: s_add_u32 s0, s0, (.LBB8_3-.Lpost_getpc9)&4294967295 -; GCN-NEXT: s_addc_u32 s1, s1, (.LBB8_3-.Lpost_getpc9)>>32 +; GCN-NEXT: s_add_u32 s0, s0, (.LBB8_4-.Lpost_getpc9)&4294967295 +; GCN-NEXT: s_addc_u32 s1, s1, (.LBB8_4-.Lpost_getpc9)>>32 ; GCN-NEXT: s_setpc_b64 s[0:1] ; GCN-NEXT: .LBB8_1: ; %if ; GCN-NEXT: s_load_dword s6, s[2:3], 0xb @@ -466,8 +470,9 @@ define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) %out, i32 % ; GCN-NEXT: s_waitcnt 
expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GCN-NEXT: .LBB8_3: ; %endif +; GCN-NEXT: .LBB8_3: ; %Flow ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: .LBB8_4: ; %endif ; GCN-NEXT: s_sleep 5 ; GCN-NEXT: s_endpgm entry: @@ -500,23 +505,33 @@ define amdgpu_kernel void @analyze_mask_branch() #0 { ; GCN-NEXT: v_mov_b32_e64 v0, 0 ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 -; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GCN-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: ; %bb.1: ; %ret +; GCN-NEXT: s_xor_b64 s[0:1], vcc, exec +; GCN-NEXT: s_and_b64 s[2:3], vcc, -1 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc1 .LBB9_1 +; GCN-NEXT: ; %bb.6: ; %entry +; GCN-NEXT: s_getpc_b64 s[2:3] +; GCN-NEXT: .Lpost_getpc10: +; GCN-NEXT: s_add_u32 s2, s2, (.LBB9_2-.Lpost_getpc10)&4294967295 +; GCN-NEXT: s_addc_u32 s3, s3, (.LBB9_2-.Lpost_getpc10)>>32 +; GCN-NEXT: s_setpc_b64 s[2:3] +; GCN-NEXT: .LBB9_1: ; %ret ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, 7 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-NEXT: .LBB9_2: ; %Flow1 -; GCN-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GCN-NEXT: s_cbranch_execnz .LBB9_3 -; GCN-NEXT: ; %bb.6: ; %Flow1 +; GCN-NEXT: s_xor_b64 s[2:3], s[0:1], exec +; GCN-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GCN-NEXT: s_cmov_b64 exec, s[0:1] +; GCN-NEXT: s_cbranch_scc1 .LBB9_3 +; GCN-NEXT: ; %bb.8: ; %Flow1 ; GCN-NEXT: s_getpc_b64 s[0:1] -; GCN-NEXT: .Lpost_getpc10: -; GCN-NEXT: s_add_u32 s0, s0, (.LBB9_5-.Lpost_getpc10)&4294967295 -; GCN-NEXT: s_addc_u32 s1, s1, (.LBB9_5-.Lpost_getpc10)>>32 +; GCN-NEXT: .Lpost_getpc11: +; GCN-NEXT: s_add_u32 s0, s0, (.LBB9_5-.Lpost_getpc11)&4294967295 +; GCN-NEXT: s_addc_u32 s1, s1, (.LBB9_5-.Lpost_getpc11)>>32 ; GCN-NEXT: s_setpc_b64 s[0:1] ; GCN-NEXT: .LBB9_3: ; %loop.preheader 
; GCN-NEXT: s_and_b64 vcc, exec, 0 @@ -534,12 +549,12 @@ define amdgpu_kernel void @analyze_mask_branch() #0 { ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_mov_b64 vcc, vcc ; GCN-NEXT: s_cbranch_vccnz .LBB9_5 -; GCN-NEXT: ; %bb.8: ; %loop +; GCN-NEXT: ; %bb.10: ; %loop ; GCN-NEXT: ; in Loop: Header=BB9_4 Depth=1 ; GCN-NEXT: s_getpc_b64 s[0:1] -; GCN-NEXT: .Lpost_getpc11: -; GCN-NEXT: s_add_u32 s0, s0, (.LBB9_4-.Lpost_getpc11)&4294967295 -; GCN-NEXT: s_addc_u32 s1, s1, (.LBB9_4-.Lpost_getpc11)>>32 +; GCN-NEXT: .Lpost_getpc12: +; GCN-NEXT: s_add_u32 s0, s0, (.LBB9_4-.Lpost_getpc12)&4294967295 +; GCN-NEXT: s_addc_u32 s1, s1, (.LBB9_4-.Lpost_getpc12)>>32 ; GCN-NEXT: s_setpc_b64 s[0:1] ; GCN-NEXT: .LBB9_5: ; %UnifiedReturnBlock ; GCN-NEXT: s_endpgm @@ -582,9 +597,9 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 ; GCN-NEXT: s_cbranch_scc1 .LBB10_1 ; GCN-NEXT: ; %bb.8: ; %bb ; GCN-NEXT: s_getpc_b64 s[8:9] -; GCN-NEXT: .Lpost_getpc12: -; GCN-NEXT: s_add_u32 s8, s8, (.LBB10_2-.Lpost_getpc12)&4294967295 -; GCN-NEXT: s_addc_u32 s9, s9, (.LBB10_2-.Lpost_getpc12)>>32 +; GCN-NEXT: .Lpost_getpc13: +; GCN-NEXT: s_add_u32 s8, s8, (.LBB10_2-.Lpost_getpc13)&4294967295 +; GCN-NEXT: s_addc_u32 s9, s9, (.LBB10_2-.Lpost_getpc13)>>32 ; GCN-NEXT: s_setpc_b64 s[8:9] ; GCN-NEXT: .LBB10_1: ; %bb13 ; GCN-NEXT: ;;#ASMSTART @@ -608,9 +623,9 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 ; GCN-NEXT: s_cbranch_vccz .LBB10_5 ; GCN-NEXT: ; %bb.10: ; %Flow5 ; GCN-NEXT: s_getpc_b64 s[0:1] -; GCN-NEXT: .Lpost_getpc13: -; GCN-NEXT: s_add_u32 s0, s0, (.LBB10_6-.Lpost_getpc13)&4294967295 -; GCN-NEXT: s_addc_u32 s1, s1, (.LBB10_6-.Lpost_getpc13)>>32 +; GCN-NEXT: .Lpost_getpc14: +; GCN-NEXT: s_add_u32 s0, s0, (.LBB10_6-.Lpost_getpc14)&4294967295 +; GCN-NEXT: s_addc_u32 s1, s1, (.LBB10_6-.Lpost_getpc14)>>32 ; GCN-NEXT: s_setpc_b64 s[0:1] ; GCN-NEXT: .LBB10_5: ; %bb14 ; GCN-NEXT: s_cmp_lt_i32 s5, 9 diff --git 
a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index 63cdd8a3bb16dc..ed9fd9f3640e67 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -77,10 +77,10 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB0_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -121,10 +121,10 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB0_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -152,10 +152,10 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB0_1 +; GFX8-NEXT: 
s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -183,10 +183,10 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB0_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -215,10 +215,10 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB0_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 @@ -289,10 +289,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB1_1 +; GFX10-NEXT: 
s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -344,11 +344,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB1_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -374,11 +374,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v2, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB1_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -405,11 +405,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; 
GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v2, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB1_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 @@ -441,12 +441,13 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_add_f32 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB2_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, v5 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -469,12 +470,12 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_add_f32 v5, v4, s[4:7], 0 offen offset:1024 sc0 +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX940-NEXT: ; implicit-def: $vgpr4 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB2_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, v5 ; GFX940-NEXT: buffer_inv sc1 @@ -498,12 
+499,13 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_add_f32 v5, v4, s[4:7], 0 offen offset:1024 glc +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-NEXT: ; implicit-def: $vgpr4 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB2_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, v5 ; GFX11-NEXT: buffer_gl1_inv @@ -526,22 +528,21 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB2_1 -; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s6 -; GFX10-NEXT: .LBB2_3: ; %atomicrmw.start +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s6 +; GFX10-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX10-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB2_4 Depth 2 +; GFX10-NEXT: ; Child Loop BB2_3 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v7, v8, v5 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mov_b32_e32 v6, v7 ; GFX10-NEXT: v_mov_b32_e32 v7, v8 -; GFX10-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 +; GFX10-NEXT: .LBB2_3: ; Parent Loop BB2_2 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: 
v_readfirstlane_b32 s9, v1 @@ -553,22 +554,22 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB2_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s6 +; GFX10-NEXT: s_cbranch_scc1 .LBB2_3 +; GFX10-NEXT: ; %bb.4: ; in Loop: Header=BB2_2 Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 ; GFX10-NEXT: v_mov_b32_e32 v8, v6 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB2_3 -; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX10-NEXT: ; %bb.5: ; %atomicrmw.end ; GFX10-NEXT: v_mov_b32_e32 v0, v6 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -587,12 +588,12 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_add_f32 v5, v4, s[8:11], 0 offen offset:1024 glc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: ; implicit-def: $vgpr4 -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX90A-NEXT: ; %bb.2: -; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_waitcnt 
vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v0, v5 ; GFX90A-NEXT: buffer_wbinvl1 @@ -614,11 +615,11 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: ; implicit-def: $vgpr4 -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB2_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX908-NEXT: ; %bb.2: -; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB2_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 @@ -640,19 +641,19 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB2_4 +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX908-NEXT: s_cbranch_scc1 .LBB2_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 -; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v8, v6 ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB2_3 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB2_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v6 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -672,11 +673,11 @@ define float 
@buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: ; implicit-def: $vgpr4 -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB2_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB2_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 @@ -698,19 +699,19 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB2_4 +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX8-NEXT: s_cbranch_scc1 .LBB2_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 -; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v8, v6 ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB2_3 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB2_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v6 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -729,11 +730,11 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; 
GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX7-NEXT: ; implicit-def: $vgpr4 -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB2_1 +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX7-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB2_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 @@ -755,19 +756,19 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB2_4 +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX7-NEXT: s_cbranch_scc1 .LBB2_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 -; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB2_3 +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX7-NEXT: s_cbranch_scc1 .LBB2_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v0, v6 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -786,11 +787,11 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX6-NEXT: ; implicit-def: $vgpr4 -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB2_1 +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] 
+; GFX6-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: .LBB2_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 @@ -812,19 +813,19 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB2_4 +; GFX6-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX6-NEXT: s_cbranch_scc1 .LBB2_4 ; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 -; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX6-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB2_3 +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX6-NEXT: s_cbranch_scc1 .LBB2_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v0, v6 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -897,10 +898,10 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB3_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: @@ -927,10 +928,10 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: @@ -958,10 +959,10 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB3_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: @@ -989,10 +990,10 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB3_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: @@ -1020,10 +1021,10 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB3_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: @@ -1052,10 +1053,10 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB3_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 @@ -1126,10 +1127,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB4_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: @@ -1154,11 +1155,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: @@ -1184,11 +1185,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v2, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB4_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: @@ -1214,11 +1215,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB4_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX8-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: @@ -1244,11 +1245,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v2, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB4_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: @@ -1275,11 +1276,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v2, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB4_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 @@ -1336,11 +1337,11 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB5_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; 
GFX11-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX11-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX11-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: @@ -1370,10 +1371,10 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB5_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: @@ -1400,10 +1401,10 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: @@ -1431,10 +1432,10 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB5_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: 
s_cbranch_scc1 .LBB5_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: @@ -1462,10 +1463,10 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB5_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: @@ -1493,10 +1494,10 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB5_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: @@ -1525,10 +1526,10 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB5_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: 
s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 @@ -1585,11 +1586,11 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB6_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX11-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX11-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: @@ -1619,10 +1620,10 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB6_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: @@ -1649,10 +1650,10 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX90A-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: @@ -1680,10 +1681,10 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB6_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: @@ -1711,10 +1712,10 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB6_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: @@ -1742,10 +1743,10 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB6_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, 
s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: @@ -1774,10 +1775,10 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB6_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 @@ -1834,11 +1835,11 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB7_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX11-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX11-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: @@ -1868,10 +1869,10 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB7_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: 
s_cbranch_scc1 .LBB7_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: @@ -1898,10 +1899,10 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: @@ -1929,10 +1930,10 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB7_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: @@ -1960,10 +1961,10 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB7_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; 
GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: @@ -1991,10 +1992,10 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB7_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: @@ -2023,10 +2024,10 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB7_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 @@ -2068,11 +2069,11 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz 
.LBB8_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX12-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX12-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: @@ -2110,11 +2111,11 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX11-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX11-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: @@ -2148,10 +2149,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB8_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: @@ -2196,10 +2197,10 @@ define double 
@buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB8_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: @@ -2231,10 +2232,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB8_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: @@ -2266,10 +2267,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: @@ -2302,10 +2303,10 @@ define double 
@buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB8_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 @@ -2341,11 +2342,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB9_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX12-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX12-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: @@ -2382,11 +2383,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] ; GFX11-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX11-NEXT: 
s_cselect_b32 exec_lo, s5, s4 +; GFX11-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: @@ -2418,10 +2419,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX10-NEXT: v_mov_b32_e32 v4, v7 ; GFX10-NEXT: v_mov_b32_e32 v5, v8 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB9_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: @@ -2463,11 +2464,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v7 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v5, v8 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB9_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: @@ -2496,11 +2497,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v7 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v5, v8 -; GFX8-NEXT: s_andn2_b64 exec, 
exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB9_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: @@ -2529,11 +2530,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] ; GFX7-NEXT: v_mov_b32_e32 v4, v7 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v5, v8 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB9_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: @@ -2563,11 +2564,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] ; GFX6-NEXT: v_mov_b32_e32 v4, v7 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v5, v8 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB9_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 @@ -2601,11 +2602,12 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048 +; 
GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB10_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB10_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 @@ -2631,20 +2633,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB10_4 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX12-NEXT: s_cbranch_scc1 .LBB10_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1 -; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] ; GFX12-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB10_3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB10_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -2666,12 +2669,12 @@ define double 
@buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_add_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0 +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX940-NEXT: ; implicit-def: $vgpr4 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB10_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, v6 ; GFX940-NEXT: v_mov_b32_e32 v1, v7 @@ -2696,18 +2699,17 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: ; implicit-def: $vgpr4 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB10_1 -; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX11-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB10_3: ; %atomicrmw.start +; GFX11-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB10_4 Depth 2 +; GFX11-NEXT: ; Child Loop BB10_3 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6] ; GFX11-NEXT: s_mov_b32 s2, exec_lo @@ -2715,7 +2717,7 @@ define double 
@buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 ; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 -; GFX11-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1 +; GFX11-NEXT: .LBB10_3: ; Parent Loop BB10_2 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v9 ; GFX11-NEXT: v_readfirstlane_b32 s5, v10 @@ -2729,21 +2731,22 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB10_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX11-NEXT: s_cbranch_scc1 .LBB10_3 +; GFX11-NEXT: ; %bb.4: ; in Loop: Header=BB10_2 Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] ; GFX11-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB10_3 -; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB10_2 +; GFX11-NEXT: ; %bb.5: ; %atomicrmw.end ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -2766,15 +2769,14 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB10_1 -; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s6 -; GFX10-NEXT: .LBB10_3: ; %atomicrmw.start +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s6 +; GFX10-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX10-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB10_4 Depth 2 +; GFX10-NEXT: ; Child Loop BB10_3 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6] ; GFX10-NEXT: s_mov_b32 s6, exec_lo @@ -2783,7 +2785,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX10-NEXT: v_mov_b32_e32 v1, v12 ; GFX10-NEXT: v_mov_b32_e32 v2, v13 ; GFX10-NEXT: v_mov_b32_e32 v3, v14 -; GFX10-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1 +; GFX10-NEXT: .LBB10_3: ; Parent Loop BB10_2 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v9 ; GFX10-NEXT: v_readfirstlane_b32 s9, v10 @@ -2795,11 +2797,11 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB10_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1 -; GFX10-NEXT: s_mov_b32 exec_lo, 
s6 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s6 +; GFX10-NEXT: s_cbranch_scc1 .LBB10_3 +; GFX10-NEXT: ; %bb.4: ; in Loop: Header=BB10_2 Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] ; GFX10-NEXT: v_mov_b32_e32 v14, v1 @@ -2807,11 +2809,11 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB10_3 -; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB10_2 +; GFX10-NEXT: ; %bb.5: ; %atomicrmw.end ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -2831,12 +2833,12 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_add_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: ; implicit-def: $vgpr4 -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX90A-NEXT: ; %bb.2: -; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 @@ -2863,11 +2865,11 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen 
offset:2048 +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: ; implicit-def: $vgpr4 -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB10_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX908-NEXT: ; %bb.2: -; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB10_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 @@ -2891,20 +2893,20 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB10_4 +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX908-NEXT: s_cbranch_scc1 .LBB10_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1 -; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] ; GFX908-NEXT: v_mov_b32_e32 v14, v1 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v13, v0 ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB10_3 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB10_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -2927,11 +2929,11 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 +; 
GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: ; implicit-def: $vgpr4 -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB10_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB10_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 @@ -2955,20 +2957,20 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB10_4 +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX8-NEXT: s_cbranch_scc1 .LBB10_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1 -; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] ; GFX8-NEXT: v_mov_b32_e32 v14, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v13, v0 ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB10_3 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB10_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -2990,11 +2992,11 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX7-NEXT: ; 
implicit-def: $vgpr4 -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB10_1 +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX7-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB10_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 @@ -3018,20 +3020,20 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB10_4 +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX7-NEXT: s_cbranch_scc1 .LBB10_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1 -; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] ; GFX7-NEXT: v_mov_b32_e32 v14, v1 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v13, v0 ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB10_3 +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX7-NEXT: s_cbranch_scc1 .LBB10_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -3053,11 +3055,11 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 +; GFX6-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX6-NEXT: ; implicit-def: $vgpr4 -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; 
GFX6-NEXT: s_cbranch_execnz .LBB10_1 +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX6-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: .LBB10_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 @@ -3082,20 +3084,20 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB10_4 +; GFX6-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX6-NEXT: s_cbranch_scc1 .LBB10_4 ; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1 -; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] ; GFX6-NEXT: v_mov_b32_e32 v14, v1 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX6-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v13, v0 ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB10_3 +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX6-NEXT: s_cbranch_scc1 .LBB10_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 @@ -3133,11 +3135,11 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB11_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 
+; GFX12-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX12-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX12-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: @@ -3175,11 +3177,11 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB11_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX11-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX11-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: @@ -3213,10 +3215,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB11_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: @@ -3245,10 +3247,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; 
GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: @@ -3280,10 +3282,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB11_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: @@ -3315,10 +3317,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB11_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: @@ -3350,10 +3352,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; 
GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB11_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: @@ -3386,10 +3388,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB11_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 @@ -3427,11 +3429,11 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB12_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX12-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX12-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -3469,11 +3471,11 @@ define double 
@buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB12_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX11-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX11-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -3507,10 +3509,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB12_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -3555,10 +3557,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB12_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: 
s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -3590,10 +3592,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB12_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -3625,10 +3627,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB12_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -3661,10 +3663,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB12_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB12_1 ; 
GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 @@ -3689,12 +3691,12 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: s_and_b32 s4, s6, -4 ; GFX12-NEXT: v_mov_b32_e32 v5, s4 ; GFX12-NEXT: s_and_b32 s4, s6, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s6, 0 ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: s_not_b32 s5, s5 ; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3706,19 +3708,19 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX12-NEXT: v_and_or_b32 v1, v2, s5, v1 ; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB13_1 +; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s7, exec_lo, s6 +; GFX12-NEXT: s_cselect_b32 exec_lo, 
s7, s6 +; GFX12-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -3748,11 +3750,11 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX940-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB13_1 +; GFX940-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX940-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -3764,12 +3766,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-NEXT: s_and_b32 s4, s6, -4 ; GFX11-NEXT: v_mov_b32_e32 v5, s4 ; GFX11-NEXT: s_and_b32 s4, s6, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s6, 0 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX11-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen -; GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: s_not_b32 s5, s5 +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3780,7 +3783,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX11-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX11-NEXT: v_and_or_b32 v1, v2, s5, v1 ; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc @@ -3789,12 +3792,12 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX11-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB13_1 +; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s7, exec_lo, s6 +; GFX11-NEXT: s_cselect_b32 exec_lo, s7, s6 +; GFX11-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3809,11 +3812,11 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX10-NEXT: s_mov_b32 s9, s7 ; GFX10-NEXT: s_mov_b32 s8, s6 ; GFX10-NEXT: s_and_b32 s4, s18, 3 -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: buffer_load_dword v2, v5, s[8:11], 0 offen +; GFX10-NEXT: s_lshl_b32 s4, s4, 3 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_not_b32 s5, s5 ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3821,7 +3824,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_f16_e32 v1, v1, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX10-NEXT: v_and_or_b32 v1, v2, s5, v1 ; GFX10-NEXT: 
v_mov_b32_e32 v4, v2 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[8:11], 0 offen glc @@ -3830,11 +3833,11 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB13_1 +; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX10-NEXT: s_andn2_b32 s7, exec_lo, s6 +; GFX10-NEXT: s_cselect_b32 exec_lo, s7, s6 +; GFX10-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -3867,11 +3870,11 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3905,11 +3908,11 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v2, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB13_1 +; GFX908-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -3944,11 +3947,11 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB13_1 +; GFX8-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -3987,11 +3990,11 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB13_1 +; GFX7-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -4032,11 +4035,11 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB13_1 +; GFX6-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: 
v_lshrrev_b32_e32 v0, s6, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -4059,12 +4062,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_and_b32 s4, s6, -4 ; GFX12-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-NEXT: s_and_b32 s4, s6, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s6, 0 ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: s_not_b32 s5, s5 ; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4076,19 +4079,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX12-NEXT: v_and_or_b32 v1, v2, s5, v1 ; GFX12-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB14_1 +; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s7, exec_lo, s6 +; GFX12-NEXT: s_cselect_b32 exec_lo, s7, s6 +; GFX12-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: 
s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -4117,11 +4120,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX940-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB14_1 +; GFX940-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX940-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -4132,12 +4135,13 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX11-NEXT: s_and_b32 s4, s6, -4 ; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: s_and_b32 s4, s6, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s6, 0 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX11-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen -; GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: s_not_b32 s5, s5 +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -4148,7 +4152,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX11-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX11-NEXT: v_and_or_b32 v1, v2, s5, v1 ; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc @@ -4157,12 +4161,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX11-NEXT: v_mov_b32_e32 v2, v4 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s7, exec_lo, s6 +; GFX11-NEXT: s_cselect_b32 exec_lo, s7, s6 +; GFX11-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -4176,11 +4180,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX10-NEXT: s_mov_b32 s9, s7 ; GFX10-NEXT: s_mov_b32 s8, s6 ; GFX10-NEXT: s_and_b32 s4, s18, 3 -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: buffer_load_dword v2, v3, s[8:11], 0 offen +; GFX10-NEXT: s_lshl_b32 s4, s4, 3 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_not_b32 s5, s5 ; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4188,7 +4192,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_f16_e32 v1, v1, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v1, 
v2, s6, v1 +; GFX10-NEXT: v_and_or_b32 v1, v2, s5, v1 ; GFX10-NEXT: v_mov_b32_e32 v5, v2 ; GFX10-NEXT: v_mov_b32_e32 v4, v1 ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc @@ -4197,11 +4201,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v4 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB14_1 +; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX10-NEXT: s_andn2_b32 s7, exec_lo, s6 +; GFX10-NEXT: s_cselect_b32 exec_lo, s7, s6 +; GFX10-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -4233,11 +4237,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -4270,11 +4274,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v2, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; 
GFX908-NEXT: s_cbranch_execnz .LBB14_1 +; GFX908-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -4308,11 +4312,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB14_1 +; GFX8-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -4350,11 +4354,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB14_1 +; GFX7-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -4393,11 +4397,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; 
GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB14_1 +; GFX6-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 @@ -4435,10 +4439,11 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB15_1 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 @@ -4469,32 +4474,33 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB15_4 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX12-NEXT: s_cbranch_scc1 .LBB15_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 ; GFX12-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, 
s1 -; GFX12-NEXT: s_cbranch_execnz .LBB15_3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB15_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX940-NEXT: v_and_b32_e32 v10, -4, v4 -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX940-NEXT: v_add_u32_e32 v6, 0x200, v4 +; GFX940-NEXT: v_and_b32_e32 v4, -4, v6 +; GFX940-NEXT: v_and_b32_e32 v6, 3, v6 +; GFX940-NEXT: v_lshlrev_b32_e32 v10, 3, v6 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v6, v4, s0 +; GFX940-NEXT: v_lshlrev_b32_e64 v6, v10, s0 ; GFX940-NEXT: v_not_b32_e32 v11, v6 ; GFX940-NEXT: s_mov_b64 s[2:3], exec ; GFX940-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 @@ -4507,19 +4513,19 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v7, v10, s[4:7], 0 offen -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB15_1 +; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 ; GFX940-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 ; 
GFX940-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX940-NEXT: v_lshrrev_b32_e32 v6, v10, v7 ; GFX940-NEXT: v_add_f16_e32 v6, v6, v5 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX940-NEXT: v_lshlrev_b32_e32 v6, v10, v6 ; GFX940-NEXT: v_and_or_b32 v6, v7, v11, v6 ; GFX940-NEXT: s_mov_b64 s[8:9], exec ; GFX940-NEXT: v_mov_b64_e32 v[8:9], v[6:7] @@ -4536,21 +4542,21 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[4:7], 0 offen sc0 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB15_4 +; GFX940-NEXT: buffer_atomic_cmpswap v[8:9], v4, s[4:7], 0 offen sc0 +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[8:9] +; GFX940-NEXT: s_cbranch_scc1 .LBB15_4 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v7, v8 ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB15_3 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB15_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v10, v8 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -4578,14 +4584,14 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 
; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB15_1 -; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX11-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB15_3: ; %atomicrmw.start +; GFX11-NEXT: .LBB15_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB15_4 Depth 2 +; GFX11-NEXT: ; Child Loop BB15_3 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v6, v4, v7 ; GFX11-NEXT: s_mov_b32 s2, exec_lo @@ -4598,7 +4604,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX11-NEXT: v_and_or_b32 v6, v7, v11, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 -; GFX11-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX11-NEXT: .LBB15_3: ; Parent Loop BB15_2 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -4612,21 +4618,22 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB15_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX11-NEXT: s_cbranch_scc1 .LBB15_3 +; GFX11-NEXT: ; %bb.4: ; in Loop: Header=BB15_2 Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 
vcc_lo, v8, v7 ; GFX11-NEXT: v_mov_b32_e32 v7, v8 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB15_3 -; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB15_2 +; GFX11-NEXT: ; %bb.5: ; %atomicrmw.end ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4651,14 +4658,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB15_1 -; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s6 -; GFX10-NEXT: .LBB15_3: ; %atomicrmw.start +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s6 +; GFX10-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX10-NEXT: .LBB15_2: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB15_4 Depth 2 +; GFX10-NEXT: ; Child Loop BB15_3 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v6, v4, v7 ; GFX10-NEXT: s_mov_b32 s6, exec_lo @@ -4668,7 +4674,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: v_and_or_b32 v6, v7, v11, v6 ; GFX10-NEXT: v_mov_b32_e32 v9, v7 ; GFX10-NEXT: v_mov_b32_e32 v8, v6 -; GFX10-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX10-NEXT: .LBB15_3: ; Parent Loop BB15_2 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: 
v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -4680,34 +4686,34 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB15_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s6 +; GFX10-NEXT: s_cbranch_scc1 .LBB15_3 +; GFX10-NEXT: ; %bb.4: ; in Loop: Header=BB15_2 Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 ; GFX10-NEXT: v_mov_b32_e32 v7, v8 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB15_3 -; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB15_2 +; GFX10-NEXT: ; %bb.5: ; %atomicrmw.end ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX90A-NEXT: v_and_b32_e32 v10, -4, v4 -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_add_u32_e32 v6, 0x200, v4 +; GFX90A-NEXT: v_and_b32_e32 v4, -4, v6 +; GFX90A-NEXT: v_and_b32_e32 v6, 3, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v10, 3, v6 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v6, v4, s4 +; 
GFX90A-NEXT: v_lshlrev_b32_e64 v6, v10, s4 ; GFX90A-NEXT: v_not_b32_e32 v11, v6 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 @@ -4720,19 +4726,19 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 +; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX90A-NEXT: ; %bb.2: -; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v10, v7 ; GFX90A-NEXT: v_add_f16_e32 v6, v6, v5 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, v10, v6 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v11, v6 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] @@ -4747,32 +4753,32 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_4 +; GFX90A-NEXT: buffer_atomic_cmpswap v[8:9], v4, s[8:11], 0 offen glc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX90A-NEXT: s_cbranch_scc1 .LBB15_4 ; GFX90A-NEXT: ; 
%bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v7, v8 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_3 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB15_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v10, v8 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX908-NEXT: v_and_b32_e32 v10, -4, v4 -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_add_u32_e32 v6, 0x200, v4 +; GFX908-NEXT: v_and_b32_e32 v4, -4, v6 +; GFX908-NEXT: v_and_b32_e32 v6, 3, v6 +; GFX908-NEXT: v_lshlrev_b32_e32 v10, 3, v6 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v6, v4, s4 +; GFX908-NEXT: v_lshlrev_b32_e64 v6, v10, s4 ; GFX908-NEXT: v_not_b32_e32 v11, v6 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 @@ -4785,19 +4791,19 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB15_1 +; GFX908-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: 
s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX908-NEXT: ; %bb.2: -; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX908-NEXT: v_lshrrev_b32_e32 v6, v10, v7 ; GFX908-NEXT: v_add_f16_e32 v6, v6, v5 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, v10, v6 ; GFX908-NEXT: v_and_or_b32 v6, v7, v11, v6 ; GFX908-NEXT: v_mov_b32_e32 v9, v7 ; GFX908-NEXT: s_mov_b64 s[12:13], exec @@ -4813,32 +4819,32 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB15_4 +; GFX908-NEXT: buffer_atomic_cmpswap v[8:9], v4, s[8:11], 0 offen glc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX908-NEXT: s_cbranch_scc1 .LBB15_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v7, v8 ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB15_3 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB15_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v10, v8 ; GFX908-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 -; GFX8-NEXT: v_and_b32_e32 v10, -4, v4 -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x200, v4 +; GFX8-NEXT: v_and_b32_e32 v4, -4, v6 +; GFX8-NEXT: v_and_b32_e32 v6, 3, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 3, v6 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v6, v4, s4 +; GFX8-NEXT: v_lshlrev_b32_e64 v6, v10, s4 ; GFX8-NEXT: v_not_b32_e32 v11, v6 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 @@ -4851,19 +4857,19 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB15_1 +; GFX8-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, v10, v7 ; GFX8-NEXT: v_add_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, v10, v6 ; GFX8-NEXT: v_and_b32_e32 v8, v7, v11 ; GFX8-NEXT: v_or_b32_e32 v6, v8, v6 ; GFX8-NEXT: v_mov_b32_e32 v9, v7 @@ -4880,31 +4886,31 @@ define half 
@buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB15_4 +; GFX8-NEXT: buffer_atomic_cmpswap v[8:9], v4, s[8:11], 0 offen glc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX8-NEXT: s_cbranch_scc1 .LBB15_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v7, v8 ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB15_3 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB15_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v10, v8 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX7-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX7-NEXT: v_and_b32_e32 v7, -4, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v8 ; GFX7-NEXT: v_not_b32_e32 v9, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 @@ -4916,11 +4922,11 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: 
v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB15_1 +; GFX7-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX7-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v4 @@ -4928,13 +4934,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v8, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX7-NEXT: s_mov_b64 s[12:13], exec ; GFX7-NEXT: v_add_f32_e32 v4, v4, v10 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v8, v4 ; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: v_mov_b32_e32 v5, v6 @@ -4949,21 +4955,21 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB15_4 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX7-NEXT: s_cbranch_scc1 .LBB15_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; 
GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB15_3 +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX7-NEXT: s_cbranch_scc1 .LBB15_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -4971,10 +4977,10 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX6-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX6-NEXT: v_and_b32_e32 v7, -4, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 +; GFX6-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v8 ; GFX6-NEXT: v_not_b32_e32 v9, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 @@ -4986,11 +4992,11 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB15_1 +; GFX6-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen +; GFX6-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX6-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v5 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: 
v_cvt_f32_f16_e32 v10, v4 @@ -4998,13 +5004,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: ; =>This Loop Header: Depth=1 ; GFX6-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX6-NEXT: s_mov_b64 s[12:13], exec ; GFX6-NEXT: v_add_f32_e32 v4, v4, v10 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v8, v4 ; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: v_mov_b32_e32 v5, v6 @@ -5019,21 +5025,21 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB15_4 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX6-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX6-NEXT: s_cbranch_scc1 .LBB15_4 ; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX6-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB15_3 +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX6-NEXT: s_cbranch_scc1 .LBB15_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX6-NEXT: 
v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -5057,15 +5063,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_addk_co_i32 s6, 0x200 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_mov_b32_e32 v4, s4 ; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_mov_b32 s6, 0 ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: s_not_b32 s5, s5 ; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -5085,19 +5090,19 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-NEXT: v_and_or_b32 v0, v1, s5, v0 ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: 
s_and_not1_b32 s7, exec_lo, s6 +; GFX12-NEXT: s_cselect_b32 exec_lo, s7, s6 +; GFX12-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -5135,11 +5140,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX940-NEXT: s_andn2_b64 s[10:11], exec, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB16_1 +; GFX940-NEXT: s_cselect_b64 exec, s[10:11], s[4:5] +; GFX940-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -5149,15 +5154,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX11-NEXT: s_addk_i32 s6, 0x200 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX11-NEXT: s_and_b32 s4, s6, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v4, s4 ; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_mov_b32 s6, 0 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen -; GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: s_not_b32 s5, s5 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5177,7 +5181,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-NEXT: v_and_or_b32 v0, v1, s5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc @@ -5186,12 +5190,12 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB16_1 +; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s7, exec_lo, s6 +; GFX11-NEXT: s_cselect_b32 exec_lo, s7, s6 +; GFX11-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -5209,9 +5213,9 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX10-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_not_b32 s5, s5 ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -5224,7 +5228,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_and_or_b32 v0, v1, s5, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc @@ -5233,11 +5237,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB16_1 +; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX10-NEXT: s_andn2_b32 s7, exec_lo, s6 +; GFX10-NEXT: s_cselect_b32 exec_lo, s7, s6 +; GFX10-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -5277,11 +5281,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[14:15], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[14:15], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5322,11 +5326,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[14:15], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: 
s_cbranch_execnz .LBB16_1 +; GFX908-NEXT: s_cselect_b64 exec, s[14:15], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -5369,11 +5373,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB16_1 +; GFX8-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -5412,11 +5416,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB16_1 +; GFX7-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -5457,11 +5461,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB16_1 +; GFX6-NEXT: 
s_cselect_b64 exec, s[12:13], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -5482,15 +5486,14 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_addk_co_i32 s6, 0x200 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_mov_b32 s6, 0 ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: s_not_b32 s5, s5 ; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -5510,19 +5513,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-NEXT: v_and_or_b32 v0, v1, s5, v0 ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v4 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; 
GFX12-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s7, exec_lo, s6 +; GFX12-NEXT: s_cselect_b32 exec_lo, s7, s6 +; GFX12-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -5559,11 +5562,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX940-NEXT: s_andn2_b64 s[10:11], exec, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB17_1 +; GFX940-NEXT: s_cselect_b64 exec, s[10:11], s[4:5] +; GFX940-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -5572,15 +5575,14 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX11-NEXT: s_addk_i32 s6, 0x200 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX11-NEXT: s_and_b32 s4, s6, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v2, s4 ; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_mov_b32 s6, 0 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen -; GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: 
s_not_b32 s5, s5 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5600,7 +5602,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-NEXT: v_and_or_b32 v0, v1, s5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc @@ -5609,12 +5611,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v4 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s7, exec_lo, s6 +; GFX11-NEXT: s_cselect_b32 exec_lo, s7, s6 +; GFX11-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -5631,9 +5633,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX10-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_not_b32 s5, s5 ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; 
GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -5646,7 +5648,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_and_or_b32 v0, v1, s5, v0 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc @@ -5655,11 +5657,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB17_1 +; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX10-NEXT: s_andn2_b32 s7, exec_lo, s6 +; GFX10-NEXT: s_cselect_b32 exec_lo, s7, s6 +; GFX10-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -5698,11 +5700,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[14:15], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[14:15], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -5742,11 +5744,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[14:15], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB17_1 +; GFX908-NEXT: s_cselect_b64 exec, s[14:15], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -5788,11 +5790,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB17_1 +; GFX8-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -5830,11 +5832,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -5873,11 +5875,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB17_1 +; GFX6-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 @@ -5915,10 +5917,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB18_3: ; %atomicrmw.start @@ -5959,20 +5962,21 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_4 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: 
s_cselect_b32 exec_lo, s0, s2 +; GFX12-NEXT: s_cbranch_scc1 .LBB18_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 -; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 ; GFX12-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB18_3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB18_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -5980,11 +5984,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX940-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX940-NEXT: v_and_b32_e32 v8, -4, v4 ; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v9, 3, v4 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0 +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v9, s0 ; GFX940-NEXT: v_not_b32_e32 v10, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec ; GFX940-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 @@ -5997,11 +6001,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB18_1 +; GFX940-NEXT: 
buffer_load_dword v7, v8, s[4:7], 0 offen +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v11, 16, v5 ; GFX940-NEXT: s_movk_i32 s10, 0x7fff @@ -6009,7 +6013,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX940-NEXT: ; =>This Loop Header: Depth=1 ; GFX940-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v9, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX940-NEXT: s_mov_b64 s[8:9], exec ; GFX940-NEXT: v_add_f32_e32 v4, v4, v11 ; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1 @@ -6019,7 +6023,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX940-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 @@ -6034,21 +6038,21 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB18_4 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 +; GFX940-NEXT: s_xor_b64 s[0:1], 
exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[8:9] +; GFX940-NEXT: s_cbranch_scc1 .LBB18_4 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 -; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB18_3 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB18_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v9, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -6076,10 +6080,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX11-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 @@ -6121,22 +6126,23 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; 
GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX11-NEXT: s_cbranch_scc1 .LBB18_4 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB18_3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB18_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -6161,11 +6167,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB18_1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s6 +; GFX10-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX10-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 @@ -6196,22 +6202,22 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: 
s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB18_4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s6 +; GFX10-NEXT: s_cbranch_scc1 .LBB18_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB18_3 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB18_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -6219,11 +6225,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX90A-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX90A-NEXT: v_and_b32_e32 v8, -4, v4 ; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v9, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v9, s4 ; GFX90A-NEXT: v_not_b32_e32 v10, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 @@ -6236,11 +6242,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 +; GFX90A-NEXT: 
buffer_load_dword v7, v8, s[8:11], 0 offen +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX90A-NEXT: ; %bb.2: -; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v11, 16, v5 ; GFX90A-NEXT: s_movk_i32 s14, 0x7fff @@ -6248,14 +6254,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v9, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX90A-NEXT: v_add_f32_e32 v4, v4, v11 ; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14 ; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] @@ -6270,32 +6276,32 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB18_4 +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, 
s[4:5], s[12:13] +; GFX90A-NEXT: s_cbranch_scc1 .LBB18_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 -; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB18_3 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB18_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v9, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX908-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX908-NEXT: v_and_b32_e32 v7, -4, v4 ; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v8, 3, v4 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4 +; GFX908-NEXT: v_lshlrev_b32_e64 v4, v8, s4 ; GFX908-NEXT: v_not_b32_e32 v9, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 @@ -6308,11 +6314,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_1 +; GFX908-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], 
s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX908-NEXT: ; %bb.2: -; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX908-NEXT: s_movk_i32 s14, 0x7fff @@ -6320,14 +6326,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v8, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX908-NEXT: v_add_f32_e32 v4, v4, v10 ; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14 ; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v4 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec @@ -6343,32 +6349,32 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_4 +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX908-NEXT: s_cbranch_scc1 .LBB18_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 -; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; 
GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB18_3 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB18_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 -; GFX8-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX8-NEXT: v_and_b32_e32 v7, -4, v4 ; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 3, v4 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4 +; GFX8-NEXT: v_lshlrev_b32_e64 v4, v8, s4 ; GFX8-NEXT: v_not_b32_e32 v9, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 @@ -6381,18 +6387,18 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB18_1 +; GFX8-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX8-NEXT: 
.LBB18_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v8, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_add_f32_e32 v4, v4, v10 ; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 @@ -6400,7 +6406,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX8-NEXT: v_mov_b32_e32 v4, v5 @@ -6417,31 +6423,31 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB18_4 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX8-NEXT: s_cbranch_scc1 .LBB18_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 -; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 
-; GFX8-NEXT: s_cbranch_execnz .LBB18_3 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB18_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX7-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX7-NEXT: v_and_b32_e32 v7, -4, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v8 ; GFX7-NEXT: v_not_b32_e32 v9, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 @@ -6453,11 +6459,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB18_1 +; GFX7-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX7-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 @@ -6465,11 +6471,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; 
GFX7-NEXT: v_lshrrev_b32_e32 v4, v8, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_add_f32_e32 v4, v4, v10 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v8, v4 ; GFX7-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX7-NEXT: v_mov_b32_e32 v4, v5 @@ -6486,21 +6492,21 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB18_4 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX7-NEXT: s_cbranch_scc1 .LBB18_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 -; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB18_3 +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX7-NEXT: s_cbranch_scc1 .LBB18_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -6508,10 +6514,10 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX6-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX6-NEXT: v_and_b32_e32 v7, -4, v4 ; GFX6-NEXT: 
v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 +; GFX6-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v8 ; GFX6-NEXT: v_not_b32_e32 v9, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 @@ -6523,11 +6529,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB18_1 +; GFX6-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen +; GFX6-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX6-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v5 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 @@ -6535,11 +6541,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX6-NEXT: ; =>This Loop Header: Depth=1 ; GFX6-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_add_f32_e32 v4, v4, v10 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v8, v4 ; GFX6-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX6-NEXT: v_mov_b32_e32 v4, v5 @@ -6556,21 +6562,21 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], 
v8, s[8:11], 0 offen glc -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB18_4 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX6-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX6-NEXT: s_cbranch_scc1 .LBB18_4 ; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 -; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX6-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB18_3 +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX6-NEXT: s_cbranch_scc1 .LBB18_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -6632,11 +6638,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX11-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX11-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -6666,10 +6672,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX10-NEXT: 
buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB19_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -6710,10 +6716,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB19_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -6743,10 +6749,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB19_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -6776,27 +6782,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, 
v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen glc +; GFX7-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX7-NEXT: v_mov_b32_e32 v6, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB19_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -6826,29 +6832,28 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: 
s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen glc +; GFX6-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX6-NEXT: v_mov_b32_e32 v6, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB19_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -6905,11 +6910,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX11-NEXT: v_mov_b32_e32 v2, v4 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB20_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX11-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX11-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -6938,10 +6943,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB20_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -6995,11 +7000,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB20_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -7029,27 +7034,27 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX7-NEXT: 
v_add_f32_e32 v5, v5, v0 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[8:11], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[8:11], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB20_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -7077,31 +7082,30 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX6-NEXT: 
v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[8:11], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[8:11], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB20_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -7132,12 +7136,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: 
s_cbranch_execnz .LBB21_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, v5 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -7160,12 +7165,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[4:7], 0 offen offset:1024 sc0 +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX940-NEXT: ; implicit-def: $vgpr4 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB21_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, v5 ; GFX940-NEXT: buffer_inv sc1 @@ -7189,15 +7194,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: ; implicit-def: $vgpr4 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB21_1 -; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX11-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB21_3: ; %atomicrmw.start +; GFX11-NEXT: .LBB21_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX11-NEXT: ; Child Loop BB21_3 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v7, v8, 
v5 ; GFX11-NEXT: s_mov_b32 s2, exec_lo @@ -7205,7 +7210,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v6, v7 ; GFX11-NEXT: v_mov_b32_e32 v7, v8 -; GFX11-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 +; GFX11-NEXT: .LBB21_3: ; Parent Loop BB21_2 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -7219,21 +7224,22 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB21_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX11-NEXT: s_cbranch_scc1 .LBB21_3 +; GFX11-NEXT: ; %bb.4: ; in Loop: Header=BB21_2 Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 ; GFX11-NEXT: v_mov_b32_e32 v8, v6 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB21_3 -; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB21_2 +; GFX11-NEXT: ; %bb.5: ; %atomicrmw.end ; GFX11-NEXT: v_mov_b32_e32 v0, v6 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -7253,22 +7259,21 @@ define <2 x half> 
@buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB21_1 -; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s6 -; GFX10-NEXT: .LBB21_3: ; %atomicrmw.start +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s6 +; GFX10-NEXT: s_cbranch_scc1 .LBB21_1 +; GFX10-NEXT: .LBB21_2: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX10-NEXT: ; Child Loop BB21_3 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_f16 v7, v8, v5 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mov_b32_e32 v6, v7 ; GFX10-NEXT: v_mov_b32_e32 v7, v8 -; GFX10-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 +; GFX10-NEXT: .LBB21_3: ; Parent Loop BB21_2 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -7280,22 +7285,22 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB21_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s6 +; GFX10-NEXT: s_cbranch_scc1 .LBB21_3 +; GFX10-NEXT: ; %bb.4: ; in Loop: Header=BB21_2 Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 ; GFX10-NEXT: v_mov_b32_e32 v8, v6 ; 
GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB21_3 -; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB21_2 +; GFX10-NEXT: ; %bb.5: ; %atomicrmw.end ; GFX10-NEXT: v_mov_b32_e32 v0, v6 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -7314,12 +7319,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[8:11], 0 offen offset:1024 glc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: ; implicit-def: $vgpr4 -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX90A-NEXT: ; %bb.2: -; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v0, v5 ; GFX90A-NEXT: buffer_wbinvl1 @@ -7341,11 +7346,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: ; implicit-def: $vgpr4 -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB21_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX908-NEXT: ; %bb.2: -; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 @@ 
-7367,19 +7372,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB21_4 +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX908-NEXT: s_cbranch_scc1 .LBB21_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 -; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v8, v6 ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB21_3 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB21_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v6 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -7399,11 +7404,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: ; implicit-def: $vgpr4 -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB21_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 @@ -7427,26 +7432,26 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt 
vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB21_4 +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX8-NEXT: s_cbranch_scc1 .LBB21_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 -; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v8, v6 ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB21_3 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB21_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v6 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -7458,19 +7463,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX7-NEXT: ; implicit-def: $vgpr4 -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB21_1 +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX7-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v5 +; 
GFX7-NEXT: v_cvt_f16_f32_e32 v10, v5 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 @@ -7481,15 +7486,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v10 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v11 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v9 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v10 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -7501,22 +7506,22 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB21_4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; 
GFX7-NEXT: s_cbranch_scc1 .LBB21_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 -; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v7 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB21_3 +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX7-NEXT: s_cbranch_scc1 .LBB21_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -7524,7 +7529,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 @@ -7536,19 +7541,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX6-NEXT: ; implicit-def: $vgpr4 -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB21_1 +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX6-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX6-NEXT: ; 
%bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v10, v5 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v9, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v10 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 @@ -7557,18 +7562,17 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: s_mov_b64 s[12:13], exec ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v10 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v11 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v9 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v10 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 @@ -7580,25 +7584,24 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc -; 
GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB21_4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc +; GFX6-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX6-NEXT: s_cbranch_scc1 .LBB21_4 ; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 -; GFX6-NEXT: s_mov_b64 exec, s[12:13] -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v7 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX6-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB21_3 +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX6-NEXT: s_cbranch_scc1 .LBB21_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v0, v4 ; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -7654,11 +7657,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB22_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX11-NEXT: 
s_cselect_b32 exec_lo, s5, s4 +; GFX11-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: @@ -7688,10 +7691,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB22_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: @@ -7718,10 +7721,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: @@ -7749,10 +7752,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB22_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX908-NEXT: ; 
%bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: @@ -7782,10 +7785,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB22_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: @@ -7815,27 +7818,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen glc +; GFX7-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX7-NEXT: v_mov_b32_e32 v6, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; 
GFX7-NEXT: v_cvt_f32_f16_e32 v0, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB22_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: @@ -7865,29 +7868,28 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen glc +; GFX6-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX6-NEXT: v_mov_b32_e32 v6, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB22_1 +; 
GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst @@ -7944,11 +7946,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX11-NEXT: v_mov_b32_e32 v2, v4 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB23_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX11-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX11-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: @@ -7977,10 +7979,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB23_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: @@ -8005,11 +8007,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: 
v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: @@ -8035,11 +8037,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v2, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB23_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: @@ -8067,11 +8069,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB23_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: @@ -8101,27 +8103,27 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX7-NEXT: 
v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[8:11], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[8:11], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB23_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: @@ -8149,31 +8151,30 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX6-NEXT: 
v_add_f32_e32 v5, v5, v0 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[8:11], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[8:11], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB23_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst @@ -8229,11 +8230,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB24_1 +; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX11-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX11-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: @@ -8263,10 +8264,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB24_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: @@ -8293,10 +8294,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: @@ -8324,10 +8325,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; 
GFX908-NEXT: s_cbranch_execnz .LBB24_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: @@ -8357,10 +8358,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB24_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: @@ -8390,27 +8391,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen glc +; GFX7-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX7-NEXT: v_mov_b32_e32 v6, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: 
v_lshrrev_b32_e32 v1, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB24_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: @@ -8440,29 +8441,28 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen glc +; GFX6-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX6-NEXT: v_mov_b32_e32 v6, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 
16, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB24_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 @@ -8519,11 +8519,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX11-NEXT: v_mov_b32_e32 v2, v4 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB25_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX11-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX11-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: @@ -8552,10 +8552,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB25_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX10-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: @@ -8580,11 +8580,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB25_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: @@ -8610,11 +8610,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v2, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB25_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: @@ -8642,11 +8642,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB25_1 +; GFX8-NEXT: 
s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: @@ -8676,27 +8676,27 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[8:11], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[8:11], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB25_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: @@ -8724,31 +8724,30 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[8:11], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[8:11], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB25_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 %unused = atomicrmw 
fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 @@ -8814,10 +8813,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB26_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX940-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -8865,12 +8864,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s4, exec_lo, s5 +; GFX11-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX11-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -8916,10 +8915,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB26_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 
exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -8964,10 +8963,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -9013,10 +9012,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB26_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -9063,10 +9062,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB26_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: 
s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -9106,13 +9105,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB26_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -9153,13 +9152,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB26_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 @@ -9220,11 +9219,11 @@ define void 
@buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB27_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX940-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -9268,12 +9267,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v5 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s4, exec_lo, s5 +; GFX11-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX11-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -9318,10 +9317,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB27_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX10-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -9364,11 +9363,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -9412,11 +9411,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB27_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -9461,11 +9460,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz 
.LBB27_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -9505,13 +9504,13 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB27_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -9552,13 +9551,13 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB27_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 @@ -9590,12 +9589,13 @@ define <2 x bfloat> 
@buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_pk_add_bf16 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB28_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, v5 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -9617,11 +9617,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX940-NEXT: ; implicit-def: $vgpr4 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB28_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v9, 16, v5 ; GFX940-NEXT: s_movk_i32 s10, 0x7fff @@ -9663,19 +9663,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB28_4 +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[8:9] +; GFX940-NEXT: s_cbranch_scc1 .LBB28_4 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 
Depth=1 -; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB28_3 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB28_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -9697,15 +9697,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: ; implicit-def: $vgpr4 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB28_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX11-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB28_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB28_4 Depth 2 @@ -9746,22 +9745,22 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB28_4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s2 +; 
GFX11-NEXT: s_cbranch_scc1 .LBB28_4 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB28_3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB28_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -9781,12 +9780,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB28_1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s6 +; GFX10-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX10-NEXT: .LBB28_3: ; %atomicrmw.start @@ -9824,22 +9823,22 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, 
exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB28_4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s6 +; GFX10-NEXT: s_cbranch_scc1 .LBB28_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB28_3 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB28_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -9859,11 +9858,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: ; implicit-def: $vgpr4 -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX90A-NEXT: ; %bb.2: -; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v9, 16, v5 ; GFX90A-NEXT: s_movk_i32 s14, 0x7fff @@ -9902,19 +9901,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB28_4 +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; 
GFX90A-NEXT: s_cbranch_scc1 .LBB28_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 -; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB28_3 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB28_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9934,11 +9933,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: ; implicit-def: $vgpr4 -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB28_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX908-NEXT: ; %bb.2: -; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX908-NEXT: s_movk_i32 s14, 0x7fff @@ -9978,19 +9977,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB28_4 +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX908-NEXT: s_cbranch_scc1 .LBB28_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 -; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: 
s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB28_3 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB28_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -10010,11 +10009,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: ; implicit-def: $vgpr4 -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB28_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 @@ -10055,19 +10054,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB28_4 +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX8-NEXT: s_cbranch_scc1 .LBB28_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 -; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; 
GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB28_3 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB28_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -10086,11 +10085,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX7-NEXT: ; implicit-def: $vgpr4 -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB28_1 +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX7-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v6 @@ -10102,19 +10101,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX7-NEXT: .LBB28_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB28_4 Depth 2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_add_f32_e32 v4, v4, v10 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v10 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v9 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_alignbit_b32 v7, v4, v5, 16 +; 
GFX7-NEXT: v_mov_b32_e32 v5, v6 ; GFX7-NEXT: s_mov_b64 s[12:13], exec -; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v6, v7 ; GFX7-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -10126,21 +10125,21 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB28_4 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v8, s[8:11], 0 offen glc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX7-NEXT: s_cbranch_scc1 .LBB28_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 -; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v5 ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB28_3 +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX7-NEXT: s_cbranch_scc1 .LBB28_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v0, v7 ; GFX7-NEXT: v_mov_b32_e32 v1, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -10160,11 +10159,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; 
GFX6-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX6-NEXT: ; implicit-def: $vgpr4 -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB28_1 +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX6-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v6 @@ -10176,19 +10175,20 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX6-NEXT: .LBB28_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 ; GFX6-NEXT: ; Child Loop BB28_4 Depth 2 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v7 -; GFX6-NEXT: v_mul_f32_e32 v7, 1.0, v4 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX6-NEXT: v_add_f32_e32 v4, v4, v10 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v10 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v9 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX6-NEXT: v_alignbit_b32 v6, v7, v6, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_alignbit_b32 v7, v4, v5, 16 +; GFX6-NEXT: v_mov_b32_e32 v5, v6 ; GFX6-NEXT: s_mov_b64 s[12:13], exec -; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v6, v7 ; GFX6-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 @@ -10200,24 +10200,24 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 
offen glc -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB28_4 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v8, s[8:11], 0 offen glc +; GFX6-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX6-NEXT: s_cbranch_scc1 .LBB28_4 ; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 -; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v5 ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB28_3 +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX6-NEXT: s_cbranch_scc1 .LBB28_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v0, v7 ; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -10279,10 +10279,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB29_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX940-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; 
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: @@ -10330,12 +10330,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB29_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s4, exec_lo, s5 +; GFX11-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX11-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: @@ -10381,10 +10381,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB29_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: @@ -10429,10 +10429,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX90A-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: @@ -10478,10 +10478,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB29_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: @@ -10528,10 +10528,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB29_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: @@ -10571,13 +10571,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB29_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] 
+; GFX7-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: @@ -10618,13 +10618,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB29_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 @@ -10685,11 +10685,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB30_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX940-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: @@ -10733,12 +10733,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v5 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB30_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s4, exec_lo, s5 +; GFX11-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX11-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: @@ -10783,10 +10783,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB30_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: @@ -10829,11 +10829,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: @@ -10877,11 +10877,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX908-NEXT: 
buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB30_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: @@ -10926,11 +10926,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB30_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: @@ -10970,13 +10970,13 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB30_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: @@ -11017,13 +11017,13 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB30_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 @@ -11086,10 +11086,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB31_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX940-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: @@ -11137,12 +11137,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB31_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; 
GFX11-NEXT: s_and_not1_b32 s4, exec_lo, s5 +; GFX11-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX11-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: @@ -11188,10 +11188,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB31_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: @@ -11236,10 +11236,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: @@ -11285,10 +11285,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: 
s_cbranch_execnz .LBB31_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: @@ -11335,10 +11335,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB31_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: @@ -11378,13 +11378,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB31_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: @@ -11425,13 +11425,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: 
buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB31_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 @@ -11492,11 +11492,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB32_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX940-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: @@ -11540,12 +11540,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v5 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB32_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s4, exec_lo, s5 +; GFX11-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX11-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX11-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: @@ -11590,10 +11590,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB32_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: @@ -11636,11 +11636,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: @@ -11684,11 +11684,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz 
.LBB32_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: @@ -11733,11 +11733,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB32_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: @@ -11777,13 +11777,13 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB32_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: @@ -11824,13 +11824,13 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: 
v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB32_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 @@ -11891,11 +11891,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB33_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX940-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -11939,12 +11939,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v5 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB33_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s4, exec_lo, s5 +; GFX11-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX11-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX11-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -11989,10 +11989,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB33_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -12035,11 +12035,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -12083,11 +12083,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: 
v_mov_b32_e32 v1, v5 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB33_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -12132,11 +12132,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB33_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -12176,13 +12176,13 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB33_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -12223,13 +12223,13 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB33_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 @@ -12305,10 +12305,10 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB34_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: @@ -12337,10 +12337,10 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], 
s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: @@ -12368,10 +12368,10 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB34_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: @@ -12399,10 +12399,10 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB34_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: @@ -12430,10 +12430,10 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB34_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; 
GFX7-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: @@ -12462,10 +12462,10 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB34_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index c90296124eb127..4cf06aa4e6b27e 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -52,10 +52,10 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB0_1 +; GFX940-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX940-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX940-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: @@ -110,10 +110,10 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-NEXT: 
buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: @@ -143,10 +143,10 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB0_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: @@ -176,10 +176,10 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB0_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: @@ -250,11 +250,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX940-NEXT: buffer_inv sc1 ; 
GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX940-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB1_1 +; GFX940-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX940-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: @@ -307,11 +307,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: @@ -339,11 +339,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB1_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: @@ -371,11 +371,11 
@@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB1_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: @@ -433,12 +433,13 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_max_num_f32 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB2_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, v5 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -460,11 +461,11 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX940-NEXT: ; implicit-def: $vgpr4 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB2_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX940-NEXT: ; 
%bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 ; GFX940-NEXT: v_max_f32_e32 v9, v5, v5 ; GFX940-NEXT: .LBB2_3: ; %atomicrmw.start @@ -489,19 +490,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB2_4 +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[8:9] +; GFX940-NEXT: s_cbranch_scc1 .LBB2_4 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 -; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB2_3 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB2_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -523,12 +524,13 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_max_f32 v5, v4, s[4:7], 0 offen offset:1024 glc +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-NEXT: ; implicit-def: $vgpr4 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB2_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_waitcnt 
vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, v5 ; GFX11-NEXT: buffer_gl1_inv @@ -551,13 +553,13 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_atomic_fmax v5, v4, s[8:11], 0 offen offset:1024 glc +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB2_1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: buffer_gl1_inv @@ -580,11 +582,11 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: ; implicit-def: $vgpr4 -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX90A-NEXT: ; %bb.2: -; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_max_f32_e32 v9, v5, v5 ; GFX90A-NEXT: .LBB2_3: ; %atomicrmw.start @@ -607,19 +609,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB2_4 +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX90A-NEXT: s_cbranch_scc1 .LBB2_4 ; 
GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 -; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB2_3 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB2_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -639,11 +641,11 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: ; implicit-def: $vgpr4 -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB2_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX908-NEXT: ; %bb.2: -; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_max_f32_e32 v8, v5, v5 ; GFX908-NEXT: .LBB2_3: ; %atomicrmw.start @@ -667,19 +669,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB2_4 +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX908-NEXT: s_cbranch_scc1 .LBB2_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 -; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 
vcc, v4, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB2_3 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB2_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -699,11 +701,11 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: ; implicit-def: $vgpr4 -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB2_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_mul_f32_e32 v8, 1.0, v5 ; GFX8-NEXT: .LBB2_3: ; %atomicrmw.start @@ -727,19 +729,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB2_4 +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX8-NEXT: s_cbranch_scc1 .LBB2_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 -; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; 
GFX8-NEXT: s_cbranch_execnz .LBB2_3 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB2_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -758,12 +760,12 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_fmax v5, v4, s[8:11], 0 offen offset:1024 glc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX7-NEXT: ; implicit-def: $vgpr4 -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB2_1 +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX7-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, v5 ; GFX7-NEXT: buffer_wbinvl1 @@ -784,12 +786,12 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_fmax v5, v4, s[8:11], 0 offen offset:1024 glc +; GFX6-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX6-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX6-NEXT: ; implicit-def: $vgpr4 -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB2_1 +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX6-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, v5 ; GFX6-NEXT: buffer_wbinvl1 @@ -839,10 +841,10 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: 
s_cbranch_execnz .LBB3_1 +; GFX940-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX940-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX940-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: @@ -870,11 +872,11 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB3_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX11-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX11-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: @@ -906,10 +908,10 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB3_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: @@ -938,10 +940,10 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX90A-NEXT: 
s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: @@ -971,10 +973,10 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB3_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: @@ -1004,10 +1006,10 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB3_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: @@ -1037,10 +1039,10 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; 
GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB3_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: @@ -1071,10 +1073,10 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB3_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 @@ -1121,10 +1123,10 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB4_1 +; GFX940-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX940-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX940-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1179,10 +1181,10 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: 
s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1212,10 +1214,10 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB4_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1245,10 +1247,10 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB4_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1318,11 +1320,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 
vcc_lo, v[0:1], v[9:10] ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB5_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX12-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX12-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: @@ -1346,6 +1348,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1363,11 +1366,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB5_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX11-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX11-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: @@ -1429,10 +1432,10 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: 
v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB5_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: @@ -1466,10 +1469,10 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB5_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: @@ -1534,11 +1537,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] ; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB6_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX12-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX12-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: @@ -1561,6 +1564,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1577,11 +1581,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] ; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB6_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX11-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX11-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: @@ -1640,11 +1644,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] ; GFX908-NEXT: v_mov_b32_e32 v2, v7 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v8 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB6_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: @@ -1675,11 +1679,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, v7 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v8 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB6_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: @@ -1739,11 +1743,12 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB7_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[5:6], v[5:6] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB7_3: ; %atomicrmw.start @@ -1772,20 +1777,21 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB7_4 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cselect_b32 exec_lo, 
s0, s2 +; GFX12-NEXT: s_cbranch_scc1 .LBB7_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 -; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] ; GFX12-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB7_3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB7_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -1807,12 +1813,12 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_max_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0 +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX940-NEXT: ; implicit-def: $vgpr4 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB7_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, v6 ; GFX940-NEXT: v_mov_b32_e32 v1, v7 @@ -1837,14 +1843,14 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; 
GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: ; implicit-def: $vgpr4 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB7_1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX11-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB7_3: ; %atomicrmw.start @@ -1872,21 +1878,22 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB7_4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX11-NEXT: s_cbranch_scc1 .LBB7_4 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] ; GFX11-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB7_3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB7_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] 
; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -1905,13 +1912,13 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_atomic_fmax_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB7_1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v1, v6 @@ -1936,12 +1943,12 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_max_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: ; implicit-def: $vgpr4 -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX90A-NEXT: ; %bb.2: -; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 @@ -1968,11 +1975,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: ; implicit-def: $vgpr4 -; GFX908-NEXT: s_xor_b64 exec, 
exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB7_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX908-NEXT: ; %bb.2: -; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB7_3: ; %atomicrmw.start @@ -1998,20 +2005,20 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB7_4 +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX908-NEXT: s_cbranch_scc1 .LBB7_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 -; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] ; GFX908-NEXT: v_mov_b32_e32 v14, v1 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v13, v0 ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB7_3 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB7_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -2034,11 +2041,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: ; implicit-def: $vgpr4 -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: 
s_cbranch_execnz .LBB7_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB7_3: ; %atomicrmw.start @@ -2064,20 +2071,20 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB7_4 +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX8-NEXT: s_cbranch_scc1 .LBB7_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 -; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] ; GFX8-NEXT: v_mov_b32_e32 v14, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v13, v0 ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB7_3 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB7_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -2095,12 +2102,12 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_fmax_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX7-NEXT: ; implicit-def: $vgpr4 -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB7_1 
+; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX7-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, v5 ; GFX7-NEXT: v_mov_b32_e32 v1, v6 @@ -2122,12 +2129,12 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_fmax_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc +; GFX6-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX6-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX6-NEXT: ; implicit-def: $vgpr4 -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB7_1 +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX6-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, v5 ; GFX6-NEXT: v_mov_b32_e32 v1, v6 @@ -2171,11 +2178,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB8_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX12-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX12-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: @@ -2199,6 +2206,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen 
offset:2048 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2216,11 +2224,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX11-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX11-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: @@ -2256,10 +2264,10 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB8_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: @@ -2290,10 +2298,10 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 +; GFX90A-NEXT: s_andn2_b64 
s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: @@ -2327,10 +2335,10 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB8_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: @@ -2364,10 +2372,10 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB8_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: @@ -2401,10 +2409,10 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, 
s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: @@ -2439,10 +2447,10 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB8_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 @@ -2482,11 +2490,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB9_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX12-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX12-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -2510,6 +2518,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen 
offset:2048 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2527,11 +2536,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX11-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX11-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -2593,10 +2602,10 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB9_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -2630,10 +2639,10 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; 
GFX8-NEXT: s_cbranch_execnz .LBB9_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -2682,15 +2691,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: s_addk_co_i32 s6, 0x200 ; GFX12-NEXT: v_max_num_f16_e32 v5, v0, v0 ; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_mov_b32_e32 v4, s4 ; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_mov_b32 s6, 0 ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: s_not_b32 s5, s5 ; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2703,7 +2711,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-NEXT: v_and_or_b32 v0, v1, s5, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN @@ -2711,12 +2719,12 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: 
global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB10_1 +; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s7, exec_lo, s6 +; GFX12-NEXT: s_cselect_b32 exec_lo, s7, s6 +; GFX12-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2748,11 +2756,11 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX940-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB10_1 +; GFX940-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX940-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -2762,15 +2770,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-NEXT: s_addk_i32 s6, 0x200 ; GFX11-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX11-NEXT: s_and_b32 s4, s6, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v4, s4 ; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_mov_b32 s6, 0 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 ; 
GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen -; GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: s_not_b32 s5, s5 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2784,7 +2791,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-NEXT: v_and_or_b32 v0, v1, s5, v0 ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2792,12 +2799,12 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB10_1 +; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s7, exec_lo, s6 +; GFX11-NEXT: s_cselect_b32 exec_lo, s7, s6 +; GFX11-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2815,9 +2822,9 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX10-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX10-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: 
s_not_b32 s5, s5 ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2826,7 +2833,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v5 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_and_or_b32 v0, v1, s5, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc @@ -2835,11 +2842,11 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB10_1 +; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX10-NEXT: s_andn2_b32 s7, exec_lo, s6 +; GFX10-NEXT: s_cselect_b32 exec_lo, s7, s6 +; GFX10-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2874,11 +2881,11 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ 
-2914,11 +2921,11 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB10_1 +; GFX908-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -2955,11 +2962,11 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB10_1 +; GFX8-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -2998,11 +3005,11 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB10_1 +; GFX7-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -3043,11 +3050,11 @@ define half 
@buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB10_1 +; GFX6-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -3068,15 +3075,14 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_addk_co_i32 s6, 0x200 ; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0 ; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_mov_b32 s6, 0 ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: s_not_b32 s5, s5 ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3089,7 +3095,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-NEXT: v_and_or_b32 v0, v1, s5, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 
v4, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN @@ -3097,12 +3103,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v4 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB11_1 +; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s7, exec_lo, s6 +; GFX12-NEXT: s_cselect_b32 exec_lo, s7, s6 +; GFX12-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3133,11 +3139,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX940-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB11_1 +; GFX940-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX940-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3146,15 +3152,14 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX11-NEXT: s_addk_i32 s6, 0x200 ; GFX11-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX11-NEXT: s_and_b32 s4, s6, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v2, s4 ; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_mov_b32 s6, 0 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen -; GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: s_not_b32 s5, s5 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3168,7 +3173,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-NEXT: v_and_or_b32 v0, v1, s5, v0 ; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3176,12 +3181,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v4 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB11_1 +; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s7, exec_lo, s6 +; GFX11-NEXT: s_cselect_b32 exec_lo, s7, s6 +; GFX11-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3198,9 
+3203,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX10-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX10-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_not_b32 s5, s5 ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3209,7 +3214,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_and_or_b32 v0, v1, s5, v0 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc @@ -3218,11 +3223,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB11_1 +; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX10-NEXT: s_andn2_b32 s7, exec_lo, s6 +; GFX10-NEXT: s_cselect_b32 exec_lo, s7, s6 +; GFX10-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3256,11 +3261,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; 
GFX90A-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3295,11 +3300,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB11_1 +; GFX908-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3335,11 +3340,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB11_1 +; GFX8-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3377,11 +3382,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; 
GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB11_1 +; GFX7-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3420,11 +3425,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB11_1 +; GFX6-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 @@ -3462,10 +3467,11 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB12_1 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_max_num_f16_e32 v10, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB12_3: ; %atomicrmw.start @@ -3500,20 +3506,21 @@ define half 
@buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB12_4 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX12-NEXT: s_cbranch_scc1 .LBB12_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 -; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 ; GFX12-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB12_3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB12_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -3521,11 +3528,11 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX940-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX940-NEXT: v_and_b32_e32 v8, -4, v4 ; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v9, 3, v4 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0 +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v9, s0 ; GFX940-NEXT: v_not_b32_e32 v10, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec ; GFX940-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 
@@ -3538,21 +3545,21 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB12_1 +; GFX940-NEXT: buffer_load_dword v7, v8, s[4:7], 0 offen +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 ; GFX940-NEXT: v_max_f16_e32 v11, v5, v5 ; GFX940-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 ; GFX940-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v8, v7 +; GFX940-NEXT: v_lshrrev_b32_e32 v4, v9, v7 ; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX940-NEXT: v_max_f16_e32 v4, v4, v11 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v8, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, v9, v4 ; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX940-NEXT: s_mov_b64 s[8:9], exec ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] @@ -3569,21 +3576,21 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB12_4 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[8:9] +; GFX940-NEXT: s_cbranch_scc1 .LBB12_4 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 -; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: 
s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB12_3 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB12_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v9, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -3611,10 +3618,11 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB12_1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX11-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB12_3: ; %atomicrmw.start @@ -3648,21 +3656,22 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB12_4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX11-NEXT: s_cbranch_scc1 .LBB12_4 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; 
GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB12_3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB12_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3687,11 +3696,11 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB12_1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s6 +; GFX10-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX10-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 @@ -3718,22 +3727,22 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB12_4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s6 +; GFX10-NEXT: s_cbranch_scc1 .LBB12_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 -; GFX10-NEXT: s_mov_b32 exec_lo, 
s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB12_3 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB12_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -3741,11 +3750,11 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX90A-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX90A-NEXT: v_and_b32_e32 v8, -4, v4 ; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v9, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v9, s4 ; GFX90A-NEXT: v_not_b32_e32 v10, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 @@ -3758,21 +3767,21 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 +; GFX90A-NEXT: buffer_load_dword v7, v8, s[8:11], 0 offen +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX90A-NEXT: ; %bb.2: -; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; 
GFX90A-NEXT: v_max_f16_e32 v11, v5, v5 ; GFX90A-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v8, v7 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v9, v7 ; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX90A-NEXT: v_max_f16_e32 v4, v4, v11 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v8, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v9, v4 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] @@ -3787,32 +3796,32 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB12_4 +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX90A-NEXT: s_cbranch_scc1 .LBB12_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 -; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB12_3 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB12_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v9, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX908-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX908-NEXT: v_and_b32_e32 v7, -4, v4 ; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v8, 3, v4 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4 +; GFX908-NEXT: v_lshlrev_b32_e64 v4, v8, s4 ; GFX908-NEXT: v_not_b32_e32 v9, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 @@ -3825,21 +3834,21 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB12_1 +; GFX908-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX908-NEXT: ; %bb.2: -; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX908-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX908-NEXT: v_lshrrev_b32_e32 v4, v8, v6 ; GFX908-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX908-NEXT: v_max_f16_e32 v4, v4, v10 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, v8, v4 ; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec @@ -3855,32 +3864,32 @@ define half 
@buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB12_4 +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX908-NEXT: s_cbranch_scc1 .LBB12_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 -; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB12_3 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB12_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 -; GFX8-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX8-NEXT: v_and_b32_e32 v7, -4, v4 ; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 3, v4 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4 +; GFX8-NEXT: v_lshlrev_b32_e64 v4, v8, s4 ; GFX8-NEXT: v_not_b32_e32 v9, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 @@ -3893,21 +3902,21 @@ define half 
@buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB12_1 +; GFX8-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX8-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, v8, v6 ; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX8-NEXT: v_max_f16_e32 v4, v4, v10 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, v8, v4 ; GFX8-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX8-NEXT: v_mov_b32_e32 v4, v5 @@ -3924,31 +3933,31 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB12_4 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX8-NEXT: s_cbranch_scc1 .LBB12_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 -; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; 
GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB12_3 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB12_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX7-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX7-NEXT: v_and_b32_e32 v7, -4, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v8 ; GFX7-NEXT: v_not_b32_e32 v9, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 @@ -3960,11 +3969,11 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB12_1 +; GFX7-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX7-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v4 @@ -3972,13 +3981,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop 
BB12_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v8, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX7-NEXT: s_mov_b64 s[12:13], exec ; GFX7-NEXT: v_max_f32_e32 v4, v4, v10 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v8, v4 ; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: v_mov_b32_e32 v5, v6 @@ -3993,21 +4002,21 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB12_4 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX7-NEXT: s_cbranch_scc1 .LBB12_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 -; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB12_3 +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX7-NEXT: s_cbranch_scc1 .LBB12_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -4015,10 +4024,10 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX6-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX6-NEXT: v_and_b32_e32 v7, -4, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 +; GFX6-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v8 ; GFX6-NEXT: v_not_b32_e32 v9, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 @@ -4030,11 +4039,11 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB12_1 +; GFX6-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen +; GFX6-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX6-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v5 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v4 @@ -4042,13 +4051,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: ; =>This Loop Header: Depth=1 ; GFX6-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX6-NEXT: s_mov_b64 s[12:13], exec ; GFX6-NEXT: v_max_f32_e32 v4, v4, v10 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v8, v4 ; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: v_mov_b32_e32 v5, v6 @@ -4063,21 +4072,21 @@ define half 
@buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB12_4 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX6-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX6-NEXT: s_cbranch_scc1 .LBB12_4 ; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 -; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX6-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB12_3 +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX6-NEXT: s_cbranch_scc1 .LBB12_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -4101,15 +4110,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_addk_co_i32 s6, 0x200 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_mov_b32_e32 v4, s4 ; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_mov_b32 s6, 0 ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null 
offen -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: s_not_b32 s5, s5 ; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4129,19 +4137,19 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-NEXT: v_and_or_b32 v0, v1, s5, v0 ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB13_1 +; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s7, exec_lo, s6 +; GFX12-NEXT: s_cselect_b32 exec_lo, s7, s6 +; GFX12-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -4179,11 +4187,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX940-NEXT: s_andn2_b64 s[10:11], exec, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB13_1 +; GFX940-NEXT: s_cselect_b64 exec, s[10:11], s[4:5] +; GFX940-NEXT: s_cbranch_scc1 .LBB13_1 ; 
GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -4193,15 +4201,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX11-NEXT: s_addk_i32 s6, 0x200 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX11-NEXT: s_and_b32 s4, s6, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v4, s4 ; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_mov_b32 s6, 0 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen -; GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: s_not_b32 s5, s5 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4221,7 +4228,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-NEXT: v_and_or_b32 v0, v1, s5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc @@ -4230,12 +4237,12 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB13_1 +; 
GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s7, exec_lo, s6 +; GFX11-NEXT: s_cselect_b32 exec_lo, s7, s6 +; GFX11-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4253,9 +4260,9 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX10-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_not_b32 s5, s5 ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4268,7 +4275,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_and_or_b32 v0, v1, s5, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc @@ -4277,11 +4284,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB13_1 +; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX10-NEXT: s_andn2_b32 s7, exec_lo, s6 +; GFX10-NEXT: s_cselect_b32 exec_lo, s7, s6 +; GFX10-NEXT: s_cbranch_scc1 .LBB13_1 ; 
GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4321,11 +4328,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[14:15], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[14:15], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4366,11 +4373,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[14:15], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB13_1 +; GFX908-NEXT: s_cselect_b64 exec, s[14:15], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -4413,11 +4420,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB13_1 +; GFX8-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -4457,11 +4464,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB13_1 +; GFX7-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -4503,11 +4510,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB13_1 +; GFX6-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -4528,15 +4535,14 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_addk_co_i32 s6, 0x200 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_mov_b32 s6, 0 ; GFX12-NEXT: s_lshl_b32 s4, s4, 
3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: s_not_b32 s5, s5 ; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4556,19 +4562,19 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-NEXT: v_and_or_b32 v0, v1, s5, v0 ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v4 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB14_1 +; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s7, exec_lo, s6 +; GFX12-NEXT: s_cselect_b32 exec_lo, s7, s6 +; GFX12-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4605,11 +4611,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX940-NEXT: s_andn2_b64 s[10:11], exec, s[4:5] ; GFX940-NEXT: 
v_mov_b32_e32 v1, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB14_1 +; GFX940-NEXT: s_cselect_b64 exec, s[10:11], s[4:5] +; GFX940-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4618,15 +4624,14 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX11-NEXT: s_addk_i32 s6, 0x200 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX11-NEXT: s_and_b32 s4, s6, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v2, s4 ; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_mov_b32 s6, 0 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen -; GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: s_not_b32 s5, s5 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4646,7 +4651,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-NEXT: v_and_or_b32 v0, v1, s5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc @@ -4655,12 +4660,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX11-NEXT: buffer_gl0_inv ; 
GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v4 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s7, exec_lo, s6 +; GFX11-NEXT: s_cselect_b32 exec_lo, s7, s6 +; GFX11-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4677,9 +4682,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX10-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_not_b32 s5, s5 ; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4692,7 +4697,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_and_or_b32 v0, v1, s5, v0 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc @@ -4701,11 +4706,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX10-NEXT: 
v_mov_b32_e32 v1, v4 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB14_1 +; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX10-NEXT: s_andn2_b32 s7, exec_lo, s6 +; GFX10-NEXT: s_cselect_b32 exec_lo, s7, s6 +; GFX10-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4744,11 +4749,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[14:15], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[14:15], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4788,11 +4793,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[14:15], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB14_1 +; GFX908-NEXT: s_cselect_b64 exec, s[14:15], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4834,11 +4839,11 @@ define 
void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB14_1 +; GFX8-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4877,11 +4882,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB14_1 +; GFX7-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4921,11 +4926,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB14_1 +; GFX6-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, 
ptr addrspace(7) %ptr, i32 256 @@ -4963,10 +4968,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB15_1 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB15_3: ; %atomicrmw.start @@ -5007,20 +5013,21 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB15_4 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX12-NEXT: s_cbranch_scc1 .LBB15_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 ; GFX12-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB15_3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB15_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 
v0, v7, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -5028,11 +5035,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX940-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX940-NEXT: v_and_b32_e32 v8, -4, v4 ; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v9, 3, v4 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0 +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v9, s0 ; GFX940-NEXT: v_not_b32_e32 v10, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec ; GFX940-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 @@ -5045,11 +5052,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB15_1 +; GFX940-NEXT: buffer_load_dword v7, v8, s[4:7], 0 offen +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v11, 16, v5 ; GFX940-NEXT: s_movk_i32 s10, 0x7fff @@ -5057,7 +5064,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX940-NEXT: ; =>This Loop Header: Depth=1 ; GFX940-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v9, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX940-NEXT: s_mov_b64 
s[8:9], exec ; GFX940-NEXT: v_max_f32_e32 v4, v4, v11 ; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1 @@ -5067,7 +5074,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX940-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 @@ -5082,21 +5089,21 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB15_4 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[8:9] +; GFX940-NEXT: s_cbranch_scc1 .LBB15_4 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB15_3 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB15_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v9, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -5124,10 +5131,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX11-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 @@ -5169,22 +5177,23 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB15_4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX11-NEXT: s_cbranch_scc1 .LBB15_4 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB15_3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB15_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; 
GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -5209,11 +5218,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB15_1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s6 +; GFX10-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX10-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 @@ -5244,22 +5253,22 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB15_4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s6 +; GFX10-NEXT: s_cbranch_scc1 .LBB15_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB15_3 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB15_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; 
GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -5267,11 +5276,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX90A-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX90A-NEXT: v_and_b32_e32 v8, -4, v4 ; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v9, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v9, s4 ; GFX90A-NEXT: v_not_b32_e32 v10, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 @@ -5284,11 +5293,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 +; GFX90A-NEXT: buffer_load_dword v7, v8, s[8:11], 0 offen +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX90A-NEXT: ; %bb.2: -; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v11, 16, v5 ; GFX90A-NEXT: s_movk_i32 s14, 0x7fff @@ -5296,14 +5305,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v9, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX90A-NEXT: v_max_f32_e32 
v4, v4, v11 ; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14 ; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] @@ -5318,32 +5327,32 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_4 +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX90A-NEXT: s_cbranch_scc1 .LBB15_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_3 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB15_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v9, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX908-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX908-NEXT: v_and_b32_e32 v7, -4, v4 ; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v8, 3, v4 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4 +; GFX908-NEXT: v_lshlrev_b32_e64 v4, v8, s4 ; GFX908-NEXT: v_not_b32_e32 v9, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 @@ -5356,11 +5365,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB15_1 +; GFX908-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX908-NEXT: ; %bb.2: -; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX908-NEXT: s_movk_i32 s14, 0x7fff @@ -5368,14 +5377,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v8, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX908-NEXT: v_max_f32_e32 v4, v4, v10 ; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX908-NEXT: v_add3_u32 v5, v5, v4, 
s14 ; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v4 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec @@ -5391,32 +5400,32 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB15_4 +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX908-NEXT: s_cbranch_scc1 .LBB15_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB15_3 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB15_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 -; GFX8-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX8-NEXT: v_and_b32_e32 v7, -4, v4 ; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 3, v4 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4 +; GFX8-NEXT: v_lshlrev_b32_e64 v4, v8, s4 ; GFX8-NEXT: v_not_b32_e32 v9, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 @@ -5429,18 +5438,18 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB15_1 +; GFX8-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX8-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v8, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_max_f32_e32 v4, v4, v10 ; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 @@ -5448,7 +5457,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX8-NEXT: v_mov_b32_e32 v4, v5 @@ -5465,31 +5474,31 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB15_4 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX8-NEXT: s_cbranch_scc1 .LBB15_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB15_3 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB15_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX7-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX7-NEXT: v_and_b32_e32 v7, -4, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; 
GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v8 ; GFX7-NEXT: v_not_b32_e32 v9, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 @@ -5501,11 +5510,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB15_1 +; GFX7-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX7-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 @@ -5513,12 +5522,12 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v8, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_max_f32_e32 v4, v4, v10 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v8, v4 ; GFX7-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX7-NEXT: v_mov_b32_e32 v4, v5 @@ -5535,21 +5544,21 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB15_4 +; GFX7-NEXT: 
buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX7-NEXT: s_cbranch_scc1 .LBB15_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB15_3 +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX7-NEXT: s_cbranch_scc1 .LBB15_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -5557,10 +5566,10 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX6-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX6-NEXT: v_and_b32_e32 v7, -4, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 +; GFX6-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v8 ; GFX6-NEXT: v_not_b32_e32 v9, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 @@ -5572,11 +5581,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB15_1 +; GFX6-NEXT: buffer_load_dword v6, v7, 
s[8:11], 0 offen +; GFX6-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX6-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v5 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 @@ -5584,12 +5593,12 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX6-NEXT: ; =>This Loop Header: Depth=1 ; GFX6-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_max_f32_e32 v4, v4, v10 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v8, v4 ; GFX6-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX6-NEXT: v_mov_b32_e32 v4, v5 @@ -5606,21 +5615,21 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB15_4 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX6-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX6-NEXT: s_cbranch_scc1 .LBB15_4 ; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX6-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB15_3 +; 
GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX6-NEXT: s_cbranch_scc1 .LBB15_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -5663,11 +5672,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX12-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX12-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -5694,10 +5703,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB16_1 +; GFX940-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX940-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX940-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -5726,11 +5735,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB16_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX11-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX11-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -5762,10 +5771,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB16_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -5794,10 +5803,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -5827,10 
+5836,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB16_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -5864,10 +5873,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB16_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -5897,27 +5906,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, 
s[8:11], 0 offen glc +; GFX7-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX7-NEXT: v_mov_b32_e32 v6, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB16_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -5947,29 +5956,28 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX6-NEXT: v_max_f32_e32 v5, v5, v2 ; GFX6-NEXT: v_max_f32_e32 v6, v6, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen glc +; GFX6-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX6-NEXT: v_mov_b32_e32 v6, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc ; GFX6-NEXT: 
s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB16_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -6005,11 +6013,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v4 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX12-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX12-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -6034,11 +6042,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; 
GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX940-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB17_1 +; GFX940-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX940-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -6065,11 +6073,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v4 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX11-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX11-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -6100,10 +6108,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB17_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: 
@@ -6130,11 +6138,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -6162,11 +6170,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB17_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -6198,11 +6206,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB17_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 
s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -6232,27 +6240,27 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v0 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[8:11], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[8:11], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -6280,31 +6288,30 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX6-NEXT: 
v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX6-NEXT: v_max_f32_e32 v5, v5, v0 ; GFX6-NEXT: v_max_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[8:11], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[8:11], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB17_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 %unused = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -6334,11 +6341,12 @@ define <2 x half> 
@buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_pk_max_num_f16 v8, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB18_3: ; %atomicrmw.start @@ -6367,20 +6375,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_4 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX12-NEXT: s_cbranch_scc1 .LBB18_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 -; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 ; GFX12-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB18_3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB18_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v4 ; GFX12-NEXT: 
s_setpc_b64 s[30:31] ; @@ -6400,11 +6409,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX940-NEXT: ; implicit-def: $vgpr4 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB18_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 ; GFX940-NEXT: v_pk_max_f16 v9, v5, v5 ; GFX940-NEXT: .LBB18_3: ; %atomicrmw.start @@ -6429,19 +6438,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB18_4 +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[8:9] +; GFX940-NEXT: s_cbranch_scc1 .LBB18_4 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 -; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB18_3 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB18_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -6463,11 +6472,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; 
GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: ; implicit-def: $vgpr4 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX11-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB18_3: ; %atomicrmw.start @@ -6495,21 +6505,22 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX11-NEXT: s_cbranch_scc1 .LBB18_4 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB18_3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB18_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -6529,12 +6540,12 @@ define <2 x half> 
@buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB18_1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s6 +; GFX10-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX10-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 @@ -6558,22 +6569,22 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB18_4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s6 +; GFX10-NEXT: s_cbranch_scc1 .LBB18_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB18_3 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB18_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -6593,11 +6604,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall 
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: ; implicit-def: $vgpr4 -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX90A-NEXT: ; %bb.2: -; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_pk_max_f16 v9, v5, v5 ; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.start @@ -6620,19 +6631,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB18_4 +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX90A-NEXT: s_cbranch_scc1 .LBB18_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 -; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB18_3 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB18_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6652,11 +6663,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; 
GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: ; implicit-def: $vgpr4 -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX908-NEXT: ; %bb.2: -; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX908-NEXT: .LBB18_3: ; %atomicrmw.start @@ -6680,19 +6691,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_4 +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX908-NEXT: s_cbranch_scc1 .LBB18_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 -; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB18_3 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB18_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -6712,11 +6723,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: ; implicit-def: $vgpr4 -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz 
.LBB18_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_max_f16_sdwa v8, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v9, v5, v5 @@ -6744,26 +6755,26 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB18_4 +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX8-NEXT: s_cbranch_scc1 .LBB18_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 -; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB18_3 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB18_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -6775,19 +6786,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX7-NEXT: ; implicit-def: $vgpr4 -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB18_1 +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX7-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v5 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 @@ -6798,15 +6809,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v10 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v11 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v9 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v10 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -6818,22 +6829,22 @@ define <2 x half> 
@buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB18_4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX7-NEXT: s_cbranch_scc1 .LBB18_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 -; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v7 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB18_3 +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX7-NEXT: s_cbranch_scc1 .LBB18_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -6841,7 +6852,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 @@ -6853,19 +6864,19 @@ 
define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX6-NEXT: ; implicit-def: $vgpr4 -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB18_1 +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX6-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v10, v5 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v9, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v10 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 @@ -6874,18 +6885,17 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: s_mov_b64 s[12:13], exec ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v10 -; GFX6-NEXT: v_max_f32_e32 v7, v7, v11 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v9 +; GFX6-NEXT: v_max_f32_e32 v7, v7, v10 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX6-NEXT: 
v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 @@ -6897,25 +6907,24 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB18_4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc +; GFX6-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX6-NEXT: s_cbranch_scc1 .LBB18_4 ; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 -; GFX6-NEXT: s_mov_b64 exec, s[12:13] -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v7 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX6-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB18_3 +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX6-NEXT: s_cbranch_scc1 .LBB18_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v0, v4 ; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -6973,11 
+6982,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s4, exec_lo, s5 +; GFX12-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX12-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7019,10 +7028,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB19_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX940-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7070,12 +7079,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s4, exec_lo, s5 +; GFX11-NEXT: s_cselect_b32 
exec_lo, s4, s5 +; GFX11-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7121,10 +7130,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB19_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7169,10 +7178,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB19_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7218,10 +7227,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB19_1 +; GFX908-NEXT: s_andn2_b64 
s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7268,10 +7277,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB19_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7311,13 +7320,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB19_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7358,13 +7367,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 
vcc, v5, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB19_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 @@ -7415,11 +7424,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v5 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB20_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s4, exec_lo, s5 +; GFX12-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX12-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7459,11 +7468,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB20_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX940-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, 
s[6:7] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7507,12 +7516,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v5 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB20_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s4, exec_lo, s5 +; GFX11-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX11-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7557,10 +7566,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB20_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7603,11 +7612,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: 
s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7651,11 +7660,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB20_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7700,11 +7709,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB20_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7723,19 +7732,19 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX7-NEXT: 
s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v0 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v3, 16 +; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 @@ -7744,13 +7753,13 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB20_1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7769,20 +7778,20 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX6-NEXT: v_and_b32_e32 
v0, 0xffff0000, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_max_f32_e32 v5, v5, v0 ; GFX6-NEXT: v_max_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v3, 16 +; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_alignbit_b32 v3, v3, v6, 16 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 @@ -7791,13 +7800,13 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB20_1 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 @@ -7828,11 +7837,12 @@ 
define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB21_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX12-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 @@ -7876,20 +7886,21 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB21_4 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX12-NEXT: s_cbranch_scc1 .LBB21_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 -; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 ; GFX12-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB21_3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB21_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v4 ; 
GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -7909,11 +7920,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX940-NEXT: ; implicit-def: $vgpr4 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB21_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v9, 16, v5 ; GFX940-NEXT: s_movk_i32 s10, 0x7fff @@ -7955,19 +7966,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB21_4 +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[8:9] +; GFX940-NEXT: s_cbranch_scc1 .LBB21_4 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 -; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB21_3 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB21_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -7989,15 +8000,14 @@ define <2 x bfloat> 
@buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: ; implicit-def: $vgpr4 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB21_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX11-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB21_4 Depth 2 @@ -8038,22 +8048,22 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB21_4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX11-NEXT: s_cbranch_scc1 .LBB21_4 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB21_3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; 
GFX11-NEXT: s_cbranch_scc1 .LBB21_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -8073,12 +8083,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB21_1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s6 +; GFX10-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX10-NEXT: .LBB21_3: ; %atomicrmw.start @@ -8116,22 +8126,22 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB21_4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s6 +; GFX10-NEXT: s_cbranch_scc1 .LBB21_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB21_3 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; 
GFX10-NEXT: s_cbranch_scc1 .LBB21_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -8151,11 +8161,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: ; implicit-def: $vgpr4 -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX90A-NEXT: ; %bb.2: -; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v9, 16, v5 ; GFX90A-NEXT: s_movk_i32 s14, 0x7fff @@ -8194,19 +8204,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB21_4 +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX90A-NEXT: s_cbranch_scc1 .LBB21_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 -; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB21_3 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB21_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 
v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8226,11 +8236,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: ; implicit-def: $vgpr4 -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB21_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX908-NEXT: ; %bb.2: -; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX908-NEXT: s_movk_i32 s14, 0x7fff @@ -8270,19 +8280,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB21_4 +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX908-NEXT: s_cbranch_scc1 .LBB21_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 -; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB21_3 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB21_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -8302,11 +8312,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf 
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: ; implicit-def: $vgpr4 -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB21_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 @@ -8347,19 +8357,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB21_4 +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX8-NEXT: s_cbranch_scc1 .LBB21_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 -; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB21_3 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB21_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -8378,11 +8388,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX7-NEXT: 
; implicit-def: $vgpr4 -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB21_1 +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX7-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8394,19 +8404,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX7-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v7 -; GFX7-NEXT: v_max_f32_e32 v4, v4, v9 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v9 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v10 +; GFX7-NEXT: v_alignbit_b32 v6, v5, v6, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v10 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v6, 16 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_alignbit_b32 v7, v4, v7, 16 +; GFX7-NEXT: v_mov_b32_e32 v5, v6 ; GFX7-NEXT: s_mov_b64 s[12:13], exec -; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v6, v7 ; GFX7-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -8418,21 +8428,21 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc -; GFX7-NEXT: s_xor_b64 
exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB21_4 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v8, s[8:11], 0 offen glc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX7-NEXT: s_cbranch_scc1 .LBB21_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 -; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v5 ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB21_3 +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX7-NEXT: s_cbranch_scc1 .LBB21_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v0, v7 ; GFX7-NEXT: v_mov_b32_e32 v1, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -8452,11 +8462,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX6-NEXT: ; implicit-def: $vgpr4 -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB21_1 +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX6-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8468,19 +8478,20 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX6-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 
; GFX6-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v4 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v7 -; GFX6-NEXT: v_max_f32_e32 v4, v4, v9 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX6-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v9 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v10 +; GFX6-NEXT: v_alignbit_b32 v6, v5, v6, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_max_f32_e32 v7, v7, v10 -; GFX6-NEXT: v_alignbit_b32 v5, v5, v6, 16 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_alignbit_b32 v7, v4, v7, 16 +; GFX6-NEXT: v_mov_b32_e32 v5, v6 ; GFX6-NEXT: s_mov_b64 s[12:13], exec -; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v6, v7 ; GFX6-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 @@ -8492,24 +8503,24 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB21_4 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v8, s[8:11], 0 offen glc +; GFX6-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX6-NEXT: s_cbranch_scc1 .LBB21_4 ; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 -; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 
vcc, v5, v7 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v5 ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB21_3 +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX6-NEXT: s_cbranch_scc1 .LBB21_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v0, v7 ; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -8559,10 +8570,10 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB22_1 +; GFX940-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX940-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX940-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: @@ -8619,10 +8630,10 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 
.LBB22_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: @@ -8652,10 +8663,10 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB22_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: @@ -8685,10 +8696,10 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB22_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index 91adbfa5599761..153c61cefbbbb9 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -52,10 +52,10 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX940-NEXT: buffer_inv sc1 
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB0_1 +; GFX940-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX940-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX940-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: @@ -110,10 +110,10 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: @@ -143,10 +143,10 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB0_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: @@ -176,10 +176,10 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: buffer_wbinvl1 
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB0_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: @@ -250,11 +250,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX940-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB1_1 +; GFX940-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX940-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: @@ -307,11 +307,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: @@ -339,11 +339,11 @@ define void 
@buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB1_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: @@ -371,11 +371,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB1_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: @@ -433,12 +433,13 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_min_num_f32 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB2_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12-NEXT: ; %bb.2: 
-; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, v5 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -460,11 +461,11 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX940-NEXT: ; implicit-def: $vgpr4 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB2_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 ; GFX940-NEXT: v_max_f32_e32 v9, v5, v5 ; GFX940-NEXT: .LBB2_3: ; %atomicrmw.start @@ -489,19 +490,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB2_4 +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[8:9] +; GFX940-NEXT: s_cbranch_scc1 .LBB2_4 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 -; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB2_3 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB2_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] 
; @@ -523,12 +524,13 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_min_f32 v5, v4, s[4:7], 0 offen offset:1024 glc +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-NEXT: ; implicit-def: $vgpr4 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB2_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, v5 ; GFX11-NEXT: buffer_gl1_inv @@ -551,13 +553,13 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_atomic_fmin v5, v4, s[8:11], 0 offen offset:1024 glc +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB2_1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: buffer_gl1_inv @@ -580,11 +582,11 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: ; implicit-def: $vgpr4 -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: 
s_cbranch_scc1 .LBB2_1 ; GFX90A-NEXT: ; %bb.2: -; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_max_f32_e32 v9, v5, v5 ; GFX90A-NEXT: .LBB2_3: ; %atomicrmw.start @@ -607,19 +609,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB2_4 +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX90A-NEXT: s_cbranch_scc1 .LBB2_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 -; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB2_3 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB2_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -639,11 +641,11 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: ; implicit-def: $vgpr4 -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB2_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX908-NEXT: ; %bb.2: -; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_max_f32_e32 v8, v5, v5 
; GFX908-NEXT: .LBB2_3: ; %atomicrmw.start @@ -667,19 +669,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB2_4 +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX908-NEXT: s_cbranch_scc1 .LBB2_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 -; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB2_3 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB2_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -699,11 +701,11 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: ; implicit-def: $vgpr4 -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB2_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_mul_f32_e32 v8, 1.0, v5 ; GFX8-NEXT: .LBB2_3: ; %atomicrmw.start @@ -727,19 +729,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], 
s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB2_4 +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX8-NEXT: s_cbranch_scc1 .LBB2_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 -; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB2_3 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB2_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -758,12 +760,12 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_fmin v5, v4, s[8:11], 0 offen offset:1024 glc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX7-NEXT: ; implicit-def: $vgpr4 -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB2_1 +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX7-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, v5 ; GFX7-NEXT: buffer_wbinvl1 @@ -784,12 +786,12 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_fmin v5, v4, s[8:11], 0 offen offset:1024 glc +; GFX6-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX6-NEXT: ; 
implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX6-NEXT: ; implicit-def: $vgpr4 -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB2_1 +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX6-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, v5 ; GFX6-NEXT: buffer_wbinvl1 @@ -839,10 +841,10 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB3_1 +; GFX940-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX940-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX940-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: @@ -870,11 +872,11 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB3_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX11-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX11-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: @@ -906,10 +908,10 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB3_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: @@ -938,10 +940,10 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: @@ -971,10 +973,10 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB3_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: @@ -1004,10 +1006,10 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; 
GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB3_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: @@ -1037,10 +1039,10 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB3_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: @@ -1071,10 +1073,10 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB3_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 @@ -1121,10 +1123,10 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: s_andn2_b64 
exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB4_1 +; GFX940-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX940-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX940-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1179,10 +1181,10 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1212,10 +1214,10 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB4_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1245,10 +1247,10 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 
v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB4_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1318,11 +1320,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB5_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX12-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX12-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: @@ -1346,6 +1348,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1363,11 +1366,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; 
GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB5_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX11-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX11-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: @@ -1429,10 +1432,10 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB5_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: @@ -1466,10 +1469,10 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB5_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: @@ -1534,11 +1537,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: v_cmp_eq_u64_e32 
vcc_lo, v[7:8], v[2:3] ; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB6_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX12-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX12-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: @@ -1561,6 +1564,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1577,11 +1581,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] ; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB6_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX11-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX11-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: @@ -1640,11 +1644,11 @@ define void 
@buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] ; GFX908-NEXT: v_mov_b32_e32 v2, v7 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v8 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB6_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: @@ -1675,11 +1679,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, v7 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v8 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB6_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: @@ -1739,11 +1743,12 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB7_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; 
GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[5:6], v[5:6] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB7_3: ; %atomicrmw.start @@ -1772,20 +1777,21 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB7_4 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX12-NEXT: s_cbranch_scc1 .LBB7_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 -; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] ; GFX12-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB7_3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB7_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -1807,12 +1813,12 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_min_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0 +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX940-NEXT: ; implicit-def: $vgpr4 -; GFX940-NEXT: 
s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB7_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, v6 ; GFX940-NEXT: v_mov_b32_e32 v1, v7 @@ -1837,14 +1843,14 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: ; implicit-def: $vgpr4 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB7_1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX11-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB7_3: ; %atomicrmw.start @@ -1872,21 +1878,22 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB7_4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX11-NEXT: s_cbranch_scc1 .LBB7_4 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] ; GFX11-NEXT: 
v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB7_3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB7_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -1905,13 +1912,13 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB7_1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v1, v6 @@ -1936,12 +1943,12 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_min_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: ; implicit-def: $vgpr4 -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 +; 
GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX90A-NEXT: ; %bb.2: -; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 @@ -1968,11 +1975,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: ; implicit-def: $vgpr4 -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB7_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX908-NEXT: ; %bb.2: -; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB7_3: ; %atomicrmw.start @@ -1998,20 +2005,20 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB7_4 +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX908-NEXT: s_cbranch_scc1 .LBB7_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 -; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] ; GFX908-NEXT: v_mov_b32_e32 v14, v1 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v13, v0 ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB7_3 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 
.LBB7_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -2034,11 +2041,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: ; implicit-def: $vgpr4 -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB7_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB7_3: ; %atomicrmw.start @@ -2064,20 +2071,20 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB7_4 +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX8-NEXT: s_cbranch_scc1 .LBB7_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 -; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] ; GFX8-NEXT: v_mov_b32_e32 v14, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v13, v0 ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB7_3 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB7_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 
; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -2095,12 +2102,12 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX7-NEXT: ; implicit-def: $vgpr4 -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB7_1 +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX7-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, v5 ; GFX7-NEXT: v_mov_b32_e32 v1, v6 @@ -2122,12 +2129,12 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc +; GFX6-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX6-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX6-NEXT: ; implicit-def: $vgpr4 -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB7_1 +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX6-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, v5 ; GFX6-NEXT: v_mov_b32_e32 v1, v6 @@ -2171,11 +2178,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB8_1 +; GFX12-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX12-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX12-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: @@ -2199,6 +2206,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2216,11 +2224,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX11-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX11-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: @@ -2256,10 +2264,10 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB8_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: 
s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: @@ -2290,10 +2298,10 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: @@ -2327,10 +2335,10 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB8_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: @@ -2364,10 +2372,10 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB8_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: 
s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: @@ -2401,10 +2409,10 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: @@ -2439,10 +2447,10 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB8_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 @@ -2482,11 +2490,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB9_1 +; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX12-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX12-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -2510,6 +2518,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2527,11 +2536,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX11-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX11-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -2593,10 +2602,10 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB9_1 +; 
GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -2630,10 +2639,10 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB9_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -2682,15 +2691,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: s_addk_co_i32 s6, 0x200 ; GFX12-NEXT: v_max_num_f16_e32 v5, v0, v0 ; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_mov_b32_e32 v4, s4 ; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_mov_b32 s6, 0 ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: s_not_b32 s5, s5 ; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2703,7 +2711,7 @@ define half 
@buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-NEXT: v_and_or_b32 v0, v1, s5, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN @@ -2711,12 +2719,12 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB10_1 +; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s7, exec_lo, s6 +; GFX12-NEXT: s_cselect_b32 exec_lo, s7, s6 +; GFX12-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2748,11 +2756,11 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX940-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB10_1 +; GFX940-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX940-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX940-NEXT: s_setpc_b64 
s[30:31] ; @@ -2762,15 +2770,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-NEXT: s_addk_i32 s6, 0x200 ; GFX11-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX11-NEXT: s_and_b32 s4, s6, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v4, s4 ; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_mov_b32 s6, 0 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen -; GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: s_not_b32 s5, s5 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2784,7 +2791,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-NEXT: v_and_or_b32 v0, v1, s5, v0 ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2792,12 +2799,12 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB10_1 +; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s7, exec_lo, s6 +; 
GFX11-NEXT: s_cselect_b32 exec_lo, s7, s6 +; GFX11-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2815,9 +2822,9 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX10-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX10-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_not_b32 s5, s5 ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2826,7 +2833,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_min_f16_e32 v0, v0, v5 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_and_or_b32 v0, v1, s5, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc @@ -2835,11 +2842,11 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB10_1 +; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX10-NEXT: s_andn2_b32 s7, exec_lo, s6 +; GFX10-NEXT: s_cselect_b32 exec_lo, s7, s6 +; GFX10-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2874,11 +2881,11 @@ define half 
@buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2914,11 +2921,11 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB10_1 +; GFX908-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -2955,11 +2962,11 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB10_1 +; GFX8-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -2998,11 +3005,11 @@ define half 
@buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB10_1 +; GFX7-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -3043,11 +3050,11 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB10_1 +; GFX6-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -3068,15 +3075,14 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_addk_co_i32 s6, 0x200 ; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0 ; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_mov_b32 s6, 0 ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen -; 
GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: s_not_b32 s5, s5 ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3089,7 +3095,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-NEXT: v_and_or_b32 v0, v1, s5, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN @@ -3097,12 +3103,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v4 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB11_1 +; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s7, exec_lo, s6 +; GFX12-NEXT: s_cselect_b32 exec_lo, s7, s6 +; GFX12-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3133,11 +3139,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX940-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: s_andn2_b64 
exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB11_1 +; GFX940-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX940-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3146,15 +3152,14 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX11-NEXT: s_addk_i32 s6, 0x200 ; GFX11-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX11-NEXT: s_and_b32 s4, s6, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v2, s4 ; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_mov_b32 s6, 0 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen -; GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: s_not_b32 s5, s5 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3168,7 +3173,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-NEXT: v_and_or_b32 v0, v1, s5, v0 ; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3176,12 +3181,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, 
v4 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB11_1 +; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s7, exec_lo, s6 +; GFX11-NEXT: s_cselect_b32 exec_lo, s7, s6 +; GFX11-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3198,9 +3203,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX10-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX10-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_not_b32 s5, s5 ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3209,7 +3214,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_min_f16_e32 v0, v0, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_and_or_b32 v0, v1, s5, v0 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc @@ -3218,11 +3223,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, 
exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB11_1 +; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX10-NEXT: s_andn2_b32 s7, exec_lo, s6 +; GFX10-NEXT: s_cselect_b32 exec_lo, s7, s6 +; GFX10-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3256,11 +3261,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3295,11 +3300,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB11_1 +; GFX908-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3335,11 +3340,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB11_1 +; GFX8-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3377,11 +3382,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB11_1 +; GFX7-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3420,11 +3425,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB11_1 +; GFX6-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 @@ -3462,10 +3467,11 @@ define half 
@buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB12_1 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_max_num_f16_e32 v10, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB12_3: ; %atomicrmw.start @@ -3500,20 +3506,21 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB12_4 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX12-NEXT: s_cbranch_scc1 .LBB12_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 -; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 ; GFX12-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB12_3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB12_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -3521,11 +3528,11 
@@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX940-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX940-NEXT: v_and_b32_e32 v8, -4, v4 ; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v9, 3, v4 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0 +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v9, s0 ; GFX940-NEXT: v_not_b32_e32 v10, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec ; GFX940-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 @@ -3538,21 +3545,21 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB12_1 +; GFX940-NEXT: buffer_load_dword v7, v8, s[4:7], 0 offen +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 ; GFX940-NEXT: v_max_f16_e32 v11, v5, v5 ; GFX940-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 ; GFX940-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v8, v7 +; GFX940-NEXT: v_lshrrev_b32_e32 v4, v9, v7 ; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX940-NEXT: v_min_f16_e32 v4, v4, v11 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v8, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, v9, v4 ; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX940-NEXT: s_mov_b64 s[8:9], exec ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] @@ -3569,21 +3576,21 @@ define half 
@buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB12_4 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[8:9] +; GFX940-NEXT: s_cbranch_scc1 .LBB12_4 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 -; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB12_3 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB12_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v9, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -3611,10 +3618,11 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB12_1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX11-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_max_f16_e32 v10, v5, v5 ; 
GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB12_3: ; %atomicrmw.start @@ -3648,21 +3656,22 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB12_4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX11-NEXT: s_cbranch_scc1 .LBB12_4 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB12_3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB12_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3687,11 +3696,11 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB12_1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s6 +; GFX10-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: v_max_f16_e32 v10, v5, v5 ; 
GFX10-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 @@ -3718,22 +3727,22 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB12_4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s6 +; GFX10-NEXT: s_cbranch_scc1 .LBB12_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB12_3 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB12_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -3741,11 +3750,11 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX90A-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX90A-NEXT: v_and_b32_e32 v8, -4, v4 ; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v9, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v9, s4 ; GFX90A-NEXT: v_not_b32_e32 v10, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 @@ 
-3758,21 +3767,21 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 +; GFX90A-NEXT: buffer_load_dword v7, v8, s[8:11], 0 offen +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX90A-NEXT: ; %bb.2: -; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_max_f16_e32 v11, v5, v5 ; GFX90A-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v8, v7 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v9, v7 ; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX90A-NEXT: v_min_f16_e32 v4, v4, v11 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v8, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v9, v4 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] @@ -3787,32 +3796,32 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB12_4 +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX90A-NEXT: s_cbranch_scc1 .LBB12_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 -; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: 
s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB12_3 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB12_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v9, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX908-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX908-NEXT: v_and_b32_e32 v7, -4, v4 ; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v8, 3, v4 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4 +; GFX908-NEXT: v_lshlrev_b32_e64 v4, v8, s4 ; GFX908-NEXT: v_not_b32_e32 v9, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 @@ -3825,21 +3834,21 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB12_1 +; GFX908-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX908-NEXT: ; %bb.2: -; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: 
v_max_f16_e32 v10, v5, v5 ; GFX908-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX908-NEXT: v_lshrrev_b32_e32 v4, v8, v6 ; GFX908-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX908-NEXT: v_min_f16_e32 v4, v4, v10 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, v8, v4 ; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec @@ -3855,32 +3864,32 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB12_4 +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX908-NEXT: s_cbranch_scc1 .LBB12_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 -; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB12_3 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB12_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 -; GFX8-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX8-NEXT: v_and_b32_e32 v7, -4, v4 ; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 3, v4 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4 +; GFX8-NEXT: v_lshlrev_b32_e64 v4, v8, s4 ; GFX8-NEXT: v_not_b32_e32 v9, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 @@ -3893,21 +3902,21 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB12_1 +; GFX8-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX8-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, v8, v6 ; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX8-NEXT: v_min_f16_e32 v4, v4, v10 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, v8, v4 ; GFX8-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX8-NEXT: v_mov_b32_e32 v4, v5 @@ -3924,31 +3933,31 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; 
GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB12_4 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX8-NEXT: s_cbranch_scc1 .LBB12_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 -; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB12_3 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB12_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX7-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX7-NEXT: v_and_b32_e32 v7, -4, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v8 ; GFX7-NEXT: v_not_b32_e32 v9, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 @@ -3960,11 +3969,11 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 
s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB12_1 +; GFX7-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX7-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v4 @@ -3972,13 +3981,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v8, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX7-NEXT: s_mov_b64 s[12:13], exec ; GFX7-NEXT: v_min_f32_e32 v4, v4, v10 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v8, v4 ; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: v_mov_b32_e32 v5, v6 @@ -3993,21 +4002,21 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB12_4 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX7-NEXT: s_cbranch_scc1 .LBB12_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 -; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; 
GFX7-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB12_3 +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX7-NEXT: s_cbranch_scc1 .LBB12_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -4015,10 +4024,10 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX6-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX6-NEXT: v_and_b32_e32 v7, -4, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 +; GFX6-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v8 ; GFX6-NEXT: v_not_b32_e32 v9, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 @@ -4030,11 +4039,11 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB12_1 +; GFX6-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen +; GFX6-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX6-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v5 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v4 @@ -4042,13 +4051,13 @@ define half 
@buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: ; =>This Loop Header: Depth=1 ; GFX6-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX6-NEXT: s_mov_b64 s[12:13], exec ; GFX6-NEXT: v_min_f32_e32 v4, v4, v10 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v8, v4 ; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: v_mov_b32_e32 v5, v6 @@ -4063,21 +4072,21 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB12_4 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX6-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX6-NEXT: s_cbranch_scc1 .LBB12_4 ; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 -; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX6-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB12_3 +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX6-NEXT: s_cbranch_scc1 .LBB12_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: 
s_setpc_b64 s[30:31] @@ -4101,15 +4110,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_addk_co_i32 s6, 0x200 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_mov_b32_e32 v4, s4 ; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_mov_b32 s6, 0 ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: s_not_b32 s5, s5 ; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4129,19 +4137,19 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-NEXT: v_and_or_b32 v0, v1, s5, v0 ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB13_1 +; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s7, exec_lo, s6 +; GFX12-NEXT: s_cselect_b32 exec_lo, s7, s6 +; 
GFX12-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -4179,11 +4187,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX940-NEXT: s_andn2_b64 s[10:11], exec, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB13_1 +; GFX940-NEXT: s_cselect_b64 exec, s[10:11], s[4:5] +; GFX940-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -4193,15 +4201,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX11-NEXT: s_addk_i32 s6, 0x200 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX11-NEXT: s_and_b32 s4, s6, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v4, s4 ; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_mov_b32 s6, 0 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen -; GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: s_not_b32 s5, s5 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4221,7 +4228,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: 
v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-NEXT: v_and_or_b32 v0, v1, s5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc @@ -4230,12 +4237,12 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB13_1 +; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s7, exec_lo, s6 +; GFX11-NEXT: s_cselect_b32 exec_lo, s7, s6 +; GFX11-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4253,9 +4260,9 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX10-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_not_b32 s5, s5 ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4268,7 +4275,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; 
GFX10-NEXT: v_and_or_b32 v0, v1, s5, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc @@ -4277,11 +4284,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB13_1 +; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX10-NEXT: s_andn2_b32 s7, exec_lo, s6 +; GFX10-NEXT: s_cselect_b32 exec_lo, s7, s6 +; GFX10-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4321,11 +4328,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[14:15], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[14:15], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4366,11 +4373,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[14:15], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB13_1 +; GFX908-NEXT: s_cselect_b64 exec, s[14:15], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 
.LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -4413,11 +4420,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB13_1 +; GFX8-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -4457,11 +4464,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB13_1 +; GFX7-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -4503,11 +4510,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB13_1 +; GFX6-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -4528,15 +4535,14 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_addk_co_i32 s6, 0x200 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_mov_b32 s6, 0 ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: s_not_b32 s5, s5 ; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4556,19 +4562,19 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-NEXT: v_and_or_b32 v0, v1, s5, v0 ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v4 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB14_1 +; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 
| instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s7, exec_lo, s6 +; GFX12-NEXT: s_cselect_b32 exec_lo, s7, s6 +; GFX12-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4605,11 +4611,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX940-NEXT: s_andn2_b64 s[10:11], exec, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB14_1 +; GFX940-NEXT: s_cselect_b64 exec, s[10:11], s[4:5] +; GFX940-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4618,15 +4624,14 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX11-NEXT: s_addk_i32 s6, 0x200 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX11-NEXT: s_and_b32 s4, s6, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v2, s4 ; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_mov_b32 s6, 0 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen -; GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: s_not_b32 s5, s5 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 
@@ -4646,7 +4651,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-NEXT: v_and_or_b32 v0, v1, s5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc @@ -4655,12 +4660,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v4 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s7, exec_lo, s6 +; GFX11-NEXT: s_cselect_b32 exec_lo, s7, s6 +; GFX11-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4677,9 +4682,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX10-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_not_b32 s5, s5 ; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4692,7 +4697,7 @@ define void 
@buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_and_or_b32 v0, v1, s5, v0 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc @@ -4701,11 +4706,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB14_1 +; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX10-NEXT: s_andn2_b32 s7, exec_lo, s6 +; GFX10-NEXT: s_cselect_b32 exec_lo, s7, s6 +; GFX10-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4744,11 +4749,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[14:15], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[14:15], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4788,11 +4793,11 @@ define void 
@buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[14:15], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB14_1 +; GFX908-NEXT: s_cselect_b64 exec, s[14:15], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4834,11 +4839,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB14_1 +; GFX8-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4877,11 +4882,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB14_1 +; GFX7-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4921,11 +4926,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[12:13], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB14_1 +; GFX6-NEXT: s_cselect_b64 exec, s[12:13], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 @@ -4963,10 +4968,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB15_1 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB15_3: ; %atomicrmw.start @@ -5007,20 +5013,21 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB15_4 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX12-NEXT: s_cbranch_scc1 .LBB15_4 ; GFX12-NEXT: ; %bb.5: ; in 
Loop: Header=BB15_3 Depth=1 -; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 ; GFX12-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB15_3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB15_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -5028,11 +5035,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX940-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX940-NEXT: v_and_b32_e32 v8, -4, v4 ; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v9, 3, v4 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0 +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v9, s0 ; GFX940-NEXT: v_not_b32_e32 v10, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec ; GFX940-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 @@ -5045,11 +5052,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB15_1 +; GFX940-NEXT: buffer_load_dword v7, v8, s[4:7], 0 offen +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: 
s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v11, 16, v5 ; GFX940-NEXT: s_movk_i32 s10, 0x7fff @@ -5057,7 +5064,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX940-NEXT: ; =>This Loop Header: Depth=1 ; GFX940-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v9, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX940-NEXT: s_mov_b64 s[8:9], exec ; GFX940-NEXT: v_min_f32_e32 v4, v4, v11 ; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1 @@ -5067,7 +5074,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX940-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 @@ -5082,21 +5089,21 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB15_4 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[8:9] +; GFX940-NEXT: s_cbranch_scc1 .LBB15_4 ; 
GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB15_3 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB15_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v9, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -5124,10 +5131,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX11-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 @@ -5169,22 +5177,23 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB15_4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX11-NEXT: 
s_cbranch_scc1 .LBB15_4 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB15_3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB15_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -5209,11 +5218,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB15_1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s6 +; GFX10-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX10-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 @@ -5244,22 +5253,22 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB15_4 +; 
GFX10-NEXT: s_cselect_b32 exec_lo, s4, s6 +; GFX10-NEXT: s_cbranch_scc1 .LBB15_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB15_3 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB15_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -5267,11 +5276,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX90A-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX90A-NEXT: v_and_b32_e32 v8, -4, v4 ; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v9, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v9, s4 ; GFX90A-NEXT: v_not_b32_e32 v10, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 @@ -5284,11 +5293,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 +; GFX90A-NEXT: buffer_load_dword v7, v8, s[8:11], 0 offen +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 
exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX90A-NEXT: ; %bb.2: -; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v11, 16, v5 ; GFX90A-NEXT: s_movk_i32 s14, 0x7fff @@ -5296,14 +5305,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v9, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX90A-NEXT: v_min_f32_e32 v4, v4, v11 ; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14 ; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] @@ -5318,32 +5327,32 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_4 +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX90A-NEXT: s_cbranch_scc1 .LBB15_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; 
GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_3 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB15_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v9, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX908-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX908-NEXT: v_and_b32_e32 v7, -4, v4 ; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v8, 3, v4 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4 +; GFX908-NEXT: v_lshlrev_b32_e64 v4, v8, s4 ; GFX908-NEXT: v_not_b32_e32 v9, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 @@ -5356,11 +5365,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB15_1 +; GFX908-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX908-NEXT: ; %bb.2: -; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; 
GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX908-NEXT: s_movk_i32 s14, 0x7fff @@ -5368,14 +5377,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v8, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX908-NEXT: v_min_f32_e32 v4, v4, v10 ; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14 ; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v4 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec @@ -5391,32 +5400,32 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB15_4 +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX908-NEXT: s_cbranch_scc1 .LBB15_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; 
GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB15_3 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB15_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 -; GFX8-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX8-NEXT: v_and_b32_e32 v7, -4, v4 ; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 3, v4 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4 +; GFX8-NEXT: v_lshlrev_b32_e64 v4, v8, s4 ; GFX8-NEXT: v_not_b32_e32 v9, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 @@ -5429,18 +5438,18 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB15_1 +; GFX8-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX8-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX8-NEXT: 
s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v8, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_min_f32_e32 v4, v4, v10 ; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 @@ -5448,7 +5457,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX8-NEXT: v_mov_b32_e32 v4, v5 @@ -5465,31 +5474,31 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB15_4 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX8-NEXT: s_cbranch_scc1 .LBB15_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB15_3 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB15_3 
; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX7-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX7-NEXT: v_and_b32_e32 v7, -4, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v8 ; GFX7-NEXT: v_not_b32_e32 v9, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 @@ -5501,11 +5510,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB15_1 +; GFX7-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX7-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 @@ -5513,12 +5522,12 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v8, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: 
v_min_f32_e32 v4, v4, v10 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v8, v4 ; GFX7-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX7-NEXT: v_mov_b32_e32 v4, v5 @@ -5535,21 +5544,21 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB15_4 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX7-NEXT: s_cbranch_scc1 .LBB15_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB15_3 +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX7-NEXT: s_cbranch_scc1 .LBB15_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -5557,10 +5566,10 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX6-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX6-NEXT: v_and_b32_e32 v7, -4, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, 
v7 +; GFX6-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v8 ; GFX6-NEXT: v_not_b32_e32 v9, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 @@ -5572,11 +5581,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB15_1 +; GFX6-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen +; GFX6-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX6-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v5 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 @@ -5584,12 +5593,12 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX6-NEXT: ; =>This Loop Header: Depth=1 ; GFX6-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_min_f32_e32 v4, v4, v10 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v8, v4 ; GFX6-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX6-NEXT: v_mov_b32_e32 v4, v5 @@ -5606,21 +5615,21 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] 
-; GFX6-NEXT: s_cbranch_execnz .LBB15_4 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX6-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX6-NEXT: s_cbranch_scc1 .LBB15_4 ; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX6-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB15_3 +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX6-NEXT: s_cbranch_scc1 .LBB15_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -5663,11 +5672,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX12-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX12-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -5694,10 +5703,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, 
v5 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB16_1 +; GFX940-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX940-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX940-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -5726,11 +5735,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB16_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX11-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX11-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -5762,10 +5771,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB16_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -5794,10 +5803,10 @@ define <2 x half> 
@buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -5827,10 +5836,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB16_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -5864,10 +5873,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB16_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -5897,27 +5906,27 @@ define <2 x half> 
@buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen glc +; GFX7-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX7-NEXT: v_mov_b32_e32 v6, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB16_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -5947,29 +5956,28 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX6-NEXT: v_min_f32_e32 v5, 
v5, v2 ; GFX6-NEXT: v_min_f32_e32 v6, v6, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen glc +; GFX6-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX6-NEXT: v_mov_b32_e32 v6, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB16_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -6005,11 +6013,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v4 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: 
s_cbranch_execnz .LBB17_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX12-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX12-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -6034,11 +6042,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX940-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB17_1 +; GFX940-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX940-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -6065,11 +6073,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v4 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX11-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX11-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -6100,10 
+6108,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB17_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -6130,11 +6138,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -6162,11 +6170,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB17_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -6198,11 +6206,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB17_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -6232,27 +6240,27 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_min_f32_e32 v5, v5, v0 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[8:11], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[8:11], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: 
v_cvt_f32_f16_e32 v3, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -6280,31 +6288,30 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX6-NEXT: v_min_f32_e32 v5, v5, v0 ; GFX6-NEXT: v_min_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[8:11], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[8:11], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX6-NEXT: s_or_b64 s[4:5], 
vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB17_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 %unused = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -6334,11 +6341,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_pk_max_num_f16 v8, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB18_3: ; %atomicrmw.start @@ -6367,20 +6375,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_4 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX12-NEXT: s_cbranch_scc1 .LBB18_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 -; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 ; 
GFX12-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB18_3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB18_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -6400,11 +6409,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX940-NEXT: ; implicit-def: $vgpr4 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB18_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 ; GFX940-NEXT: v_pk_max_f16 v9, v5, v5 ; GFX940-NEXT: .LBB18_3: ; %atomicrmw.start @@ -6429,19 +6438,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB18_4 +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[8:9] +; GFX940-NEXT: s_cbranch_scc1 .LBB18_4 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 -; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; 
GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB18_3 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB18_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -6463,11 +6472,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: ; implicit-def: $vgpr4 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX11-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB18_3: ; %atomicrmw.start @@ -6495,21 +6505,22 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX11-NEXT: s_cbranch_scc1 .LBB18_4 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: 
s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB18_3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB18_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -6529,12 +6540,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB18_1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s6 +; GFX10-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX10-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 @@ -6558,22 +6569,22 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB18_4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s6 +; GFX10-NEXT: s_cbranch_scc1 .LBB18_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: buffer_gl1_inv 
; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB18_3 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB18_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -6593,11 +6604,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: ; implicit-def: $vgpr4 -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX90A-NEXT: ; %bb.2: -; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_pk_max_f16 v9, v5, v5 ; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.start @@ -6620,19 +6631,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB18_4 +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX90A-NEXT: s_cbranch_scc1 .LBB18_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 -; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 -; 
GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB18_3 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB18_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6652,11 +6663,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: ; implicit-def: $vgpr4 -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX908-NEXT: ; %bb.2: -; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX908-NEXT: .LBB18_3: ; %atomicrmw.start @@ -6680,19 +6691,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_4 +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX908-NEXT: s_cbranch_scc1 .LBB18_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 -; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB18_3 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; 
GFX908-NEXT: s_cbranch_scc1 .LBB18_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -6712,11 +6723,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: ; implicit-def: $vgpr4 -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB18_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_max_f16_sdwa v8, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v9, v5, v5 @@ -6744,26 +6755,26 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB18_4 +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX8-NEXT: s_cbranch_scc1 .LBB18_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 -; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB18_3 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB18_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 
; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -6775,19 +6786,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX7-NEXT: ; implicit-def: $vgpr4 -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB18_1 +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX7-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v5 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 @@ -6798,15 +6809,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v10 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v11 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v9 +; GFX7-NEXT: 
v_min_f32_e32 v7, v7, v10 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -6818,22 +6829,22 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB18_4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX7-NEXT: s_cbranch_scc1 .LBB18_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 -; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v7 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB18_3 +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX7-NEXT: s_cbranch_scc1 .LBB18_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; 
GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -6841,7 +6852,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 @@ -6853,19 +6864,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX6-NEXT: ; implicit-def: $vgpr4 -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB18_1 +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX6-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v10, v5 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v9, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v10 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 @@ -6874,18 +6885,17 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: s_mov_b64 s[12:13], exec ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX6-NEXT: 
v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v10 -; GFX6-NEXT: v_min_f32_e32 v7, v7, v11 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v9 +; GFX6-NEXT: v_min_f32_e32 v7, v7, v10 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 @@ -6897,25 +6907,24 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB18_4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc +; GFX6-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX6-NEXT: s_cbranch_scc1 .LBB18_4 ; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 -; GFX6-NEXT: s_mov_b64 exec, s[12:13] -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v7 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX6-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX6-NEXT: buffer_wbinvl1 -; 
GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB18_3 +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX6-NEXT: s_cbranch_scc1 .LBB18_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v0, v4 ; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -6973,11 +6982,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s4, exec_lo, s5 +; GFX12-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX12-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7019,10 +7028,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB19_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX940-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7070,12 +7079,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s4, exec_lo, s5 +; GFX11-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX11-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7121,10 +7130,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB19_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7169,10 +7178,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB19_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] 
+; GFX90A-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7218,10 +7227,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB19_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7268,10 +7277,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB19_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7311,13 +7320,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: 
v_lshlrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB19_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7358,13 +7367,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB19_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 @@ -7415,11 +7424,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v5 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB20_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s4, exec_lo, s5 +; GFX12-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX12-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7459,11 +7468,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB20_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX940-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7507,12 +7516,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v5 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB20_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s4, exec_lo, s5 +; GFX11-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX11-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7557,10 +7566,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB20_1 +; GFX10-NEXT: s_andn2_b32 s4, 
exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7603,11 +7612,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7651,11 +7660,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB20_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7700,11 +7709,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], 
exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB20_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7723,19 +7732,19 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_min_f32_e32 v5, v5, v0 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v3, 16 +; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 @@ -7744,13 +7753,13 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: 
s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB20_1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7769,20 +7778,20 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_min_f32_e32 v5, v5, v0 ; GFX6-NEXT: v_min_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v3, 16 +; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_alignbit_b32 v3, v3, v6, 16 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 @@ -7791,13 +7800,13 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX6-NEXT: v_and_b32_e32 v4, 
0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB20_1 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 @@ -7828,11 +7837,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB21_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX12-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 @@ -7876,20 +7886,21 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB21_4 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX12-NEXT: s_cbranch_scc1 .LBB21_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 -; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 
0x0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 ; GFX12-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB21_3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB21_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -7909,11 +7920,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX940-NEXT: ; implicit-def: $vgpr4 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB21_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v9, 16, v5 ; GFX940-NEXT: s_movk_i32 s10, 0x7fff @@ -7955,19 +7966,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB21_4 +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[8:9] +; GFX940-NEXT: s_cbranch_scc1 .LBB21_4 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 -; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt 
vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB21_3 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB21_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -7989,15 +8000,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: ; implicit-def: $vgpr4 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB21_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX11-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB21_4 Depth 2 @@ -8038,22 +8048,22 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB21_4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX11-NEXT: s_cbranch_scc1 .LBB21_4 ; GFX11-NEXT: ; %bb.5: ; in Loop: 
Header=BB21_3 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB21_3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB21_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -8073,12 +8083,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB21_1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s6 +; GFX10-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX10-NEXT: .LBB21_3: ; %atomicrmw.start @@ -8116,22 +8126,22 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB21_4 +; GFX10-NEXT: 
s_cselect_b32 exec_lo, s4, s6 +; GFX10-NEXT: s_cbranch_scc1 .LBB21_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB21_3 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB21_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -8151,11 +8161,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: ; implicit-def: $vgpr4 -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX90A-NEXT: ; %bb.2: -; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v9, 16, v5 ; GFX90A-NEXT: s_movk_i32 s14, 0x7fff @@ -8194,19 +8204,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB21_4 +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX90A-NEXT: s_cbranch_scc1 .LBB21_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: 
Header=BB21_3 Depth=1 -; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB21_3 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB21_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8226,11 +8236,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: ; implicit-def: $vgpr4 -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB21_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX908-NEXT: ; %bb.2: -; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX908-NEXT: s_movk_i32 s14, 0x7fff @@ -8270,19 +8280,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB21_4 +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX908-NEXT: s_cbranch_scc1 .LBB21_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 -; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; 
GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB21_3 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB21_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -8302,11 +8312,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: ; implicit-def: $vgpr4 -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB21_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 @@ -8347,19 +8357,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB21_4 +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX8-NEXT: s_cbranch_scc1 .LBB21_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 -; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: s_andn2_b64 exec, exec, 
s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB21_3 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB21_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -8378,11 +8388,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX7-NEXT: ; implicit-def: $vgpr4 -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB21_1 +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX7-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8394,19 +8404,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX7-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v7 -; GFX7-NEXT: v_min_f32_e32 v4, v4, v9 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v9 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v10 +; GFX7-NEXT: v_alignbit_b32 v6, v5, v6, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v10 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v6, 16 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_alignbit_b32 v7, v4, v7, 16 
+; GFX7-NEXT: v_mov_b32_e32 v5, v6 ; GFX7-NEXT: s_mov_b64 s[12:13], exec -; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v6, v7 ; GFX7-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -8418,21 +8428,21 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB21_4 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v8, s[8:11], 0 offen glc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX7-NEXT: s_cbranch_scc1 .LBB21_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 -; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v5 ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB21_3 +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX7-NEXT: s_cbranch_scc1 .LBB21_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v0, v7 ; GFX7-NEXT: v_mov_b32_e32 v1, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -8452,11 +8462,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; 
GFX6-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX6-NEXT: ; implicit-def: $vgpr4 -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB21_1 +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX6-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8468,19 +8478,20 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX6-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 ; GFX6-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v4 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v7 -; GFX6-NEXT: v_min_f32_e32 v4, v4, v9 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX6-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v9 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v10 +; GFX6-NEXT: v_alignbit_b32 v6, v5, v6, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_min_f32_e32 v7, v7, v10 -; GFX6-NEXT: v_alignbit_b32 v5, v5, v6, 16 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_alignbit_b32 v7, v4, v7, 16 +; GFX6-NEXT: v_mov_b32_e32 v5, v6 ; GFX6-NEXT: s_mov_b64 s[12:13], exec -; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v6, v7 ; GFX6-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 @@ -8492,24 +8503,24 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB21_4 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v8, s[8:11], 0 offen glc +; GFX6-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX6-NEXT: s_cbranch_scc1 .LBB21_4 ; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 -; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v5 ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB21_3 +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX6-NEXT: s_cbranch_scc1 .LBB21_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v0, v7 ; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -8559,10 +8570,10 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB22_1 +; GFX940-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX940-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX940-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: @@ -8619,10 +8630,10 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: @@ -8652,10 +8663,10 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB22_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: @@ -8685,10 +8696,10 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB22_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll b/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll index 82808cd3092270..4b6c36b4ddd1e6 100644 --- a/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll +++ b/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll @@ -1,5 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefix=ISA ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -stop-before=si-fix-sgpr-copies < %s | FileCheck %s -check-prefix=MIR @@ -30,78 +29,17 @@ define void @f(i32 %arg, ptr %ptr) { ; ISA-NEXT: v_mov_b32_e32 v7, v6 ; ISA-NEXT: s_and_b32 s5, exec_lo, vcc_lo ; ISA-NEXT: s_or_b32 s4, s5, s4 +; ISA-NEXT: s_andn2_b32 s5, exec_lo, s4 ; ISA-NEXT: v_add_f32_e32 v6, v7, v0 ; ISA-NEXT: v_add_f32_e64 v6, v6, |v3| ; ISA-NEXT: v_add_f32_e32 v6, v6, v4 ; ISA-NEXT: v_add_f32_e32 v6, v6, v5 -; ISA-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; ISA-NEXT: s_cbranch_execnz .LBB0_1 +; ISA-NEXT: s_cselect_b32 exec_lo, s5, s4 +; ISA-NEXT: s_cbranch_scc1 .LBB0_1 ; ISA-NEXT: ; %bb.2: ; %bb21 -; ISA-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; ISA-NEXT: flat_store_dword v[1:2], v7 ; ISA-NEXT: s_waitcnt lgkmcnt(0) ; ISA-NEXT: s_setpc_b64 s[30:31] - ; MIR-LABEL: name: f - ; MIR: bb.0.bb: - ; MIR-NEXT: successors: %bb.1(0x80000000) - ; MIR-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; MIR-NEXT: {{ $}} - ; MIR-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; MIR-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; MIR-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; MIR-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; MIR-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] 
- ; MIR-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; MIR-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[S_MOV_B64_]], 0, 0 :: (invariant load (s64) from `ptr addrspace(4) null`, align 4294967296, addrspace 4) - ; MIR-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 - ; MIR-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0 - ; MIR-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 - ; MIR-NEXT: [[S_LSHR_B32_:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[COPY4]], [[S_MOV_B32_]], implicit-def dead $scc - ; MIR-NEXT: [[S_LSHR_B32_1:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[S_MOV_B32_]], [[COPY5]], implicit-def dead $scc - ; MIR-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; MIR-NEXT: S_CMP_LG_U32 [[COPY5]], [[S_MOV_B32_1]], implicit-def $scc - ; MIR-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec = COPY $scc - ; MIR-NEXT: $scc = COPY [[COPY6]] - ; MIR-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_LSHR_B32_]], [[S_MOV_B32_1]], implicit $scc - ; MIR-NEXT: [[V_CVT_F32_I32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 killed [[S_CSELECT_B32_]], 0, 0, implicit $mode, implicit $exec - ; MIR-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY [[V_CVT_F32_I32_e64_]] - ; MIR-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216 - ; MIR-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 - ; MIR-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[S_MOV_B32_2]] - ; MIR-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[S_MOV_B32_3]], 0, [[COPY8]], [[COPY6]], implicit $exec - ; MIR-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY [[V_CNDMASK_B32_e64_]] - ; MIR-NEXT: $scc = COPY [[COPY6]] - ; MIR-NEXT: [[S_CSELECT_B32_1:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_LSHR_B32_1]], [[S_MOV_B32_1]], implicit $scc - ; MIR-NEXT: [[V_CVT_F32_UBYTE0_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_UBYTE0_e64 killed [[S_CSELECT_B32_1]], 0, 0, implicit $exec - ; MIR-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY 
[[V_CVT_F32_UBYTE0_e64_]] - ; MIR-NEXT: $scc = COPY [[COPY6]] - ; MIR-NEXT: [[S_CSELECT_B32_2:%[0-9]+]]:sreg_32 = S_CSELECT_B32 [[COPY4]], [[S_MOV_B32_1]], implicit $scc - ; MIR-NEXT: [[V_CVT_F32_I32_e64_1:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 killed [[S_CSELECT_B32_2]], 0, 0, implicit $mode, implicit $exec - ; MIR-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY [[V_CVT_F32_I32_e64_1]] - ; MIR-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_I32_e64 [[COPY2]], [[S_MOV_B32_]], implicit $exec - ; MIR-NEXT: [[COPY12:%[0-9]+]]:vreg_1 = COPY [[V_CMP_LT_I32_e64_]] - ; MIR-NEXT: {{ $}} - ; MIR-NEXT: bb.1.bb14: - ; MIR-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) - ; MIR-NEXT: {{ $}} - ; MIR-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_1]], %bb.0, %7, %bb.1 - ; MIR-NEXT: [[PHI1:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_3]], %bb.0, %8, %bb.1 - ; MIR-NEXT: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY12]] - ; MIR-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[COPY13]], [[PHI]], implicit-def dead $scc - ; MIR-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[PHI1]], 0, [[COPY9]], 0, 0, implicit $mode, implicit $exec - ; MIR-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_]], 2, [[COPY7]], 0, 0, implicit $mode, implicit $exec - ; MIR-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_1]], 0, [[COPY10]], 0, 0, implicit $mode, implicit $exec - ; MIR-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_2]], 0, [[COPY11]], 0, 0, implicit $mode, implicit $exec - ; MIR-NEXT: [[COPY14:%[0-9]+]]:sgpr_32 = COPY [[V_ADD_F32_e64_3]] - ; MIR-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; MIR-NEXT: S_BRANCH %bb.2 - ; MIR-NEXT: {{ $}} - ; MIR-NEXT: bb.2.bb21: - ; MIR-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1 - ; MIR-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI 
[[SI_IF_BREAK]], %bb.1 - ; MIR-NEXT: SI_END_CF [[PHI3]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; MIR-NEXT: FLAT_STORE_DWORD [[COPY3]], [[PHI2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.ptr) - ; MIR-NEXT: SI_RETURN bb: %i = load <2 x i32>, ptr addrspace(4) null, align 4294967296 %i1 = extractelement <2 x i32> %i, i64 1 @@ -134,3 +72,5 @@ bb21: } declare float @llvm.fabs.f32(float) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; MIR: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll index 5bbea7ecf3f2d5..daaa1376d84eb6 100644 --- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll +++ b/llvm/test/CodeGen/AMDGPU/bypass-div.ll @@ -12,9 +12,10 @@ define i64 @sdiv64(i64 %a, i64 %b) { ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB0_2 +; GFX9-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_ashrrev_i32_e32 v9, 31, v3 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v9 @@ -122,9 +123,12 @@ define i64 @sdiv64(i64 %a, i64 %b) { ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v0, v2, vcc ; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-NEXT: .LBB0_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; GFX9-NEXT: s_cbranch_execz .LBB0_4 +; GFX9-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; GFX9-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX9-NEXT: s_cmov_b64 exec, s[6:7] +; GFX9-NEXT: s_cbranch_scc0 .LBB0_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v2 ; GFX9-NEXT: v_sub_u32_e32 v3, 0, v2 @@ -146,8 +150,8 @@ define i64 @sdiv64(i64 %a, 
i64 %b) { ; GFX9-NEXT: v_add_u32_e32 v3, 1, v1 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v3, vcc -; GFX9-NEXT: .LBB0_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB0_4: ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -163,9 +167,10 @@ define i64 @udiv64(i64 %a, i64 %b) { ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v3 @@ -258,9 +263,12 @@ define i64 @udiv64(i64 %a, i64 %b) { ; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v0, vcc ; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-NEXT: .LBB1_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; GFX9-NEXT: s_cbranch_execz .LBB1_4 +; GFX9-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; GFX9-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX9-NEXT: s_cmov_b64 exec, s[6:7] +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v2 ; GFX9-NEXT: v_sub_u32_e32 v3, 0, v2 @@ -282,8 +290,8 @@ define i64 @udiv64(i64 %a, i64 %b) { ; GFX9-NEXT: v_add_u32_e32 v3, 1, v1 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v3, vcc -; GFX9-NEXT: .LBB1_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB1_4: ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -299,9 +307,10 @@ define i64 @srem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; 
GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: s_xor_b64 s[8:9], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 @@ -407,9 +416,12 @@ define i64 @srem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v5, vcc ; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: .LBB2_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[8:9] -; GFX9-NEXT: s_cbranch_execz .LBB2_4 +; GFX9-NEXT: s_xor_b64 s[4:5], s[8:9], exec +; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX9-NEXT: s_cmov_b64 exec, s[8:9] +; GFX9-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v2 ; GFX9-NEXT: v_sub_u32_e32 v3, 0, v2 @@ -429,8 +441,8 @@ define i64 @srem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_sub_u32_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc -; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -446,9 +458,10 @@ define i64 @urem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB3_2 +; GFX9-NEXT: s_xor_b64 s[8:9], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v3 @@ -540,9 +553,12 @@ define i64 
@urem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc ; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: .LBB3_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[8:9] -; GFX9-NEXT: s_cbranch_execz .LBB3_4 +; GFX9-NEXT: s_xor_b64 s[4:5], s[8:9], exec +; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX9-NEXT: s_cmov_b64 exec, s[8:9] +; GFX9-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v2 ; GFX9-NEXT: v_sub_u32_e32 v3, 0, v2 @@ -562,8 +578,8 @@ define i64 @urem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_sub_u32_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc -; GFX9-NEXT: .LBB3_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB3_4: ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -703,9 +719,10 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX9-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[10:11], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-NEXT: s_xor_b64 s[10:11], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_ashrrev_i32_e32 v9, 31, v3 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v9 @@ -824,9 +841,12 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v1, v7, vcc ; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX9-NEXT: .LBB8_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] -; GFX9-NEXT: s_cbranch_execz .LBB8_4 +; GFX9-NEXT: s_xor_b64 s[4:5], s[10:11], exec +; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 +; GFX9-NEXT: 
s_cmov_b64 exec, s[10:11] +; GFX9-NEXT: s_cbranch_scc0 .LBB8_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v2 ; GFX9-NEXT: v_sub_u32_e32 v3, 0, v2 @@ -851,8 +871,8 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc -; GFX9-NEXT: .LBB8_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB8_4: ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: v_mov_b32_e32 v2, v6 @@ -874,9 +894,10 @@ define <2 x i64> @udivrem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX9-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB9_2 +; GFX9-NEXT: s_xor_b64 s[8:9], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v3 @@ -976,9 +997,12 @@ define <2 x i64> @udivrem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc ; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: .LBB9_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[8:9] -; GFX9-NEXT: s_cbranch_execz .LBB9_4 +; GFX9-NEXT: s_xor_b64 s[4:5], s[8:9], exec +; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX9-NEXT: s_cmov_b64 exec, s[8:9] +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v2 ; GFX9-NEXT: v_sub_u32_e32 v3, 0, v2 @@ -1003,8 +1027,8 @@ define <2 x i64> @udivrem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc -; GFX9-NEXT: .LBB9_4: ; GFX9-NEXT: s_or_b64 exec, exec, 
s[4:5] +; GFX9-NEXT: .LBB9_4: ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: v_mov_b32_e32 v2, v6 diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll index fdae1696a5a492..6dc4dd99f13276 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll @@ -74,15 +74,17 @@ define void @test_sinkable_flat_small_offset_i32(ptr %out, ptr %in, i32 %cond) { ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX7-NEXT: s_mov_b64 s[4:5], exec +; GFX7-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX7-NEXT: v_mov_b32_e32 v4, 0 -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7-NEXT: s_cbranch_execz .LBB0_2 +; GFX7-NEXT: s_cmov_b64 exec, vcc +; GFX7-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX7-NEXT: ; %bb.1: ; %if ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 28, v2 ; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX7-NEXT: flat_load_dword v4, v[2:3] -; GFX7-NEXT: .LBB0_2: ; %endif ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: .LBB0_2: ; %endif ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x3d08fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -94,15 +96,17 @@ define void @test_sinkable_flat_small_offset_i32(ptr %out, ptr %in, i32 %cond) { ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mov_b32_e32 v4, 0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB0_2 +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX8-NEXT: ; %bb.1: ; %if ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 28, v2 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v4, v[2:3] -; GFX8-NEXT: .LBB0_2: ; %endif ; GFX8-NEXT: 
s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: .LBB0_2: ; %endif ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x3d08fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -114,13 +118,15 @@ define void @test_sinkable_flat_small_offset_i32(ptr %out, ptr %in, i32 %cond) { ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB0_2 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX9-NEXT: ; %bb.1: ; %if ; GFX9-NEXT: flat_load_dword v4, v[2:3] offset:28 -; GFX9-NEXT: .LBB0_2: ; %endif ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB0_2: ; %endif ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x3d0000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -133,12 +139,14 @@ define void @test_sinkable_flat_small_offset_i32(ptr %out, ptr %in, i32 %cond) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB0_2 +; GFX10-NEXT: s_mov_b32 s4, exec_lo +; GFX10-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX10-NEXT: ; %bb.1: ; %if ; GFX10-NEXT: flat_load_dword v4, v[2:3] offset:28 -; GFX10-NEXT: .LBB0_2: ; %endif ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: .LBB0_2: ; %endif ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x3d0800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -228,18 +236,20 @@ define void @test_sink_noop_addrspacecast_flat_to_global_i32(ptr %out, ptr %in, ; GFX7-LABEL: test_sink_noop_addrspacecast_flat_to_global_i32: ; 
GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX7-NEXT: s_mov_b64 s[8:9], exec +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_and_b64 s[4:5], vcc, -1 ; GFX7-NEXT: v_mov_b32_e32 v4, 0 -; GFX7-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX7-NEXT: s_cbranch_execz .LBB1_2 +; GFX7-NEXT: s_cmov_b64 exec, vcc +; GFX7-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX7-NEXT: ; %bb.1: ; %if ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 offset:28 -; GFX7-NEXT: .LBB1_2: ; %endif ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: .LBB1_2: ; %endif ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x3d08fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -251,15 +261,17 @@ define void @test_sink_noop_addrspacecast_flat_to_global_i32(ptr %out, ptr %in, ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mov_b32_e32 v4, 0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB1_2 +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX8-NEXT: ; %bb.1: ; %if ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 28, v2 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v4, v[2:3] -; GFX8-NEXT: .LBB1_2: ; %endif ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: .LBB1_2: ; %endif ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x3d08fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -271,13 +283,15 @@ define void @test_sink_noop_addrspacecast_flat_to_global_i32(ptr %out, ptr %in, ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_mov_b64 s[4:5], 
exec ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX9-NEXT: ; %bb.1: ; %if ; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:28 -; GFX9-NEXT: .LBB1_2: ; %endif ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB1_2: ; %endif ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x3d0000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -290,12 +304,14 @@ define void @test_sink_noop_addrspacecast_flat_to_global_i32(ptr %out, ptr %in, ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB1_2 +; GFX10-NEXT: s_mov_b32 s4, exec_lo +; GFX10-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX10-NEXT: ; %bb.1: ; %if ; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:28 -; GFX10-NEXT: .LBB1_2: ; %endif ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: .LBB1_2: ; %endif ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x3d0800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -341,18 +357,20 @@ define void @test_sink_noop_addrspacecast_flat_to_constant_i32(ptr %out, ptr %in ; GFX7-LABEL: test_sink_noop_addrspacecast_flat_to_constant_i32: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX7-NEXT: s_mov_b64 s[8:9], exec +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_and_b64 s[4:5], vcc, -1 ; GFX7-NEXT: v_mov_b32_e32 v4, 0 -; GFX7-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX7-NEXT: s_cbranch_execz .LBB2_2 +; GFX7-NEXT: s_cmov_b64 exec, vcc +; GFX7-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX7-NEXT: ; 
%bb.1: ; %if ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 offset:28 -; GFX7-NEXT: .LBB2_2: ; %endif ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: .LBB2_2: ; %endif ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x3d08fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -364,15 +382,17 @@ define void @test_sink_noop_addrspacecast_flat_to_constant_i32(ptr %out, ptr %in ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mov_b32_e32 v4, 0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB2_2 +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX8-NEXT: ; %bb.1: ; %if ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 28, v2 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v4, v[2:3] -; GFX8-NEXT: .LBB2_2: ; %endif ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: .LBB2_2: ; %endif ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x3d08fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -384,13 +404,15 @@ define void @test_sink_noop_addrspacecast_flat_to_constant_i32(ptr %out, ptr %in ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX9-NEXT: ; %bb.1: ; %if ; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:28 -; GFX9-NEXT: .LBB2_2: ; %endif ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB2_2: ; %endif ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 
0x3d0000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -403,12 +425,14 @@ define void @test_sink_noop_addrspacecast_flat_to_constant_i32(ptr %out, ptr %in ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB2_2 +; GFX10-NEXT: s_mov_b32 s4, exec_lo +; GFX10-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX10-NEXT: ; %bb.1: ; %if ; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:28 -; GFX10-NEXT: .LBB2_2: ; %endif ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: .LBB2_2: ; %endif ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x3d0800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -509,17 +533,19 @@ define void @test_sink_flat_small_max_flat_offset(ptr %out, ptr %in) #1 { ; GFX7-LABEL: test_sink_flat_small_max_flat_offset: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v5, -1, 0 +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v4, -1, 0 +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX7-NEXT: s_mov_b64 s[4:5], exec +; GFX7-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX7-NEXT: v_mov_b32_e32 v4, 0 -; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7-NEXT: s_cbranch_execz .LBB3_2 +; GFX7-NEXT: s_cmov_b64 exec, vcc +; GFX7-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX7-NEXT: ; %bb.1: ; %if ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xfff, v2 ; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX7-NEXT: flat_load_sbyte v4, v[2:3] -; GFX7-NEXT: .LBB3_2: ; %endif ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: .LBB3_2: ; %endif ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x1000, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -530,17 
+556,19 @@ define void @test_sink_flat_small_max_flat_offset(ptr %out, ptr %in) #1 { ; GFX8-LABEL: test_sink_flat_small_max_flat_offset: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v5, -1, 0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, -1, 0 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mov_b32_e32 v4, 0 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB3_2 +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX8-NEXT: ; %bb.1: ; %if ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xfff, v2 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_sbyte v4, v[2:3] -; GFX8-NEXT: .LBB3_2: ; %endif ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: .LBB3_2: ; %endif ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x1000, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -551,15 +579,17 @@ define void @test_sink_flat_small_max_flat_offset(ptr %out, ptr %in) #1 { ; GFX9-LABEL: test_sink_flat_small_max_flat_offset: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v5, -1, 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, -1, 0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB3_2 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX9-NEXT: ; %bb.1: ; %if ; GFX9-NEXT: flat_load_sbyte v4, v[2:3] offset:4095 -; GFX9-NEXT: .LBB3_2: ; %endif ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB3_2: ; %endif ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; 
GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -571,16 +601,18 @@ define void @test_sink_flat_small_max_flat_offset(ptr %out, ptr %in) #1 { ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v4, -1, 0 +; GFX10-NEXT: s_mov_b32 s4, exec_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB3_2 +; GFX10-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX10-NEXT: ; %bb.1: ; %if ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0x800, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: flat_load_sbyte v4, v[2:3] offset:2047 -; GFX10-NEXT: .LBB3_2: ; %endif ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: .LBB3_2: ; %endif ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -630,17 +662,19 @@ define void @test_sink_flat_small_max_plus_1_flat_offset(ptr %out, ptr %in) #1 { ; GFX7-LABEL: test_sink_flat_small_max_plus_1_flat_offset: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v5, -1, 0 +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v4, -1, 0 +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX7-NEXT: s_mov_b64 s[4:5], exec +; GFX7-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX7-NEXT: v_mov_b32_e32 v4, 0 -; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7-NEXT: s_cbranch_execz .LBB4_2 +; GFX7-NEXT: s_cmov_b64 exec, vcc +; GFX7-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX7-NEXT: ; %bb.1: ; %if ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x1000, v2 ; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX7-NEXT: flat_load_sbyte v4, v[2:3] -; GFX7-NEXT: .LBB4_2: ; %endif ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: .LBB4_2: ; %endif ; GFX7-NEXT: 
v_add_i32_e32 v0, vcc, 0x61a7c, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -651,17 +685,19 @@ define void @test_sink_flat_small_max_plus_1_flat_offset(ptr %out, ptr %in) #1 { ; GFX8-LABEL: test_sink_flat_small_max_plus_1_flat_offset: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v5, -1, 0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, -1, 0 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mov_b32_e32 v4, 0 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB4_2 +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX8-NEXT: ; %bb.1: ; %if ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x1000, v2 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_sbyte v4, v[2:3] -; GFX8-NEXT: .LBB4_2: ; %endif ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: .LBB4_2: ; %endif ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x61a7c, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -672,17 +708,19 @@ define void @test_sink_flat_small_max_plus_1_flat_offset(ptr %out, ptr %in) #1 { ; GFX9-LABEL: test_sink_flat_small_max_plus_1_flat_offset: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v5, -1, 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, -1, 0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB4_2 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; %if ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0x1000, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, 
vcc ; GFX9-NEXT: flat_load_sbyte v4, v[2:3] -; GFX9-NEXT: .LBB4_2: ; %endif ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB4_2: ; %endif ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x61000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -694,16 +732,18 @@ define void @test_sink_flat_small_max_plus_1_flat_offset(ptr %out, ptr %in) #1 { ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v4, -1, 0 +; GFX10-NEXT: s_mov_b32 s4, exec_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB4_2 +; GFX10-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX10-NEXT: ; %bb.1: ; %if ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0x1000, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: flat_load_sbyte v4, v[2:3] -; GFX10-NEXT: .LBB4_2: ; %endif ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: .LBB4_2: ; %endif ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x61800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -753,17 +793,19 @@ define void @test_sinkable_flat_reg_offset(ptr %out, ptr %in, i64 %reg) #1 { ; GFX7-LABEL: test_sinkable_flat_reg_offset: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v7, -1, 0 +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v6, -1, 0 +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX7-NEXT: s_mov_b64 s[4:5], exec +; GFX7-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX7-NEXT: v_mov_b32_e32 v6, 0 -; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7-NEXT: s_cbranch_execz .LBB5_2 +; GFX7-NEXT: s_cmov_b64 exec, vcc +; GFX7-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX7-NEXT: ; %bb.1: ; %if ; GFX7-NEXT: 
v_add_i32_e32 v2, vcc, v2, v4 ; GFX7-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc ; GFX7-NEXT: flat_load_sbyte v6, v[2:3] -; GFX7-NEXT: .LBB5_2: ; %endif ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: .LBB5_2: ; %endif ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x1000, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -774,17 +816,19 @@ define void @test_sinkable_flat_reg_offset(ptr %out, ptr %in, i64 %reg) #1 { ; GFX8-LABEL: test_sinkable_flat_reg_offset: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v7, -1, 0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v6, -1, 0 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mov_b32_e32 v6, 0 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB5_2 +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX8-NEXT: ; %bb.1: ; %if ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc ; GFX8-NEXT: flat_load_sbyte v6, v[2:3] -; GFX8-NEXT: .LBB5_2: ; %endif ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: .LBB5_2: ; %endif ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x1000, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -795,17 +839,19 @@ define void @test_sinkable_flat_reg_offset(ptr %out, ptr %in, i64 %reg) #1 { ; GFX9-LABEL: test_sinkable_flat_reg_offset: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v7, -1, 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v6, -1, 0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; 
GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX9-NEXT: ; %bb.1: ; %if ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc ; GFX9-NEXT: flat_load_sbyte v6, v[2:3] -; GFX9-NEXT: .LBB5_2: ; %endif ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB5_2: ; %endif ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -817,16 +863,18 @@ define void @test_sinkable_flat_reg_offset(ptr %out, ptr %in, i64 %reg) #1 { ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v6, -1, 0 +; GFX10-NEXT: s_mov_b32 s4, exec_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 -; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB5_2 +; GFX10-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX10-NEXT: ; %bb.1: ; %if ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo ; GFX10-NEXT: flat_load_sbyte v6, v[2:3] -; GFX10-NEXT: .LBB5_2: ; %endif ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: .LBB5_2: ; %endif ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll index b23249570faa7d..17ea2c63537f7a 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll @@ -1,5 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: opt -S -passes='require,function(codegenprepare)' 
-mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=OPT %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=GCN %s diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll index 7587b81e9936da..65fb6fa2a00496 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll @@ -29,9 +29,11 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(ptr add ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b64 s[4:5], exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB0_2 +; GCN-NEXT: s_cmp_lg_u64 vcc, 0 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %if ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 2.0 @@ -41,8 +43,8 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(ptr add ; GCN-NEXT: buffer_wbinvl1_vol ; GCN-NEXT: global_load_dword v0, v[0:1], off glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: .LBB0_2: ; %endif ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: .LBB0_2: ; %endif ; GCN-NEXT: v_mov_b32_e32 v1, 0x3d0000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_store_dword v1, v0, s[0:1] offset:2300 diff --git a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll index 3216e71e6221ae..144a1cb25bd07a 100644 --- a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll +++ b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll @@ -208,27 +208,29 @@ define void @recursive_phis(i1 %cond, ptr addrspace(5) %ptr) { ; DAGISEL-ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; DAGISEL-ASM-NEXT: v_and_b32_e32 
v0, 1, v0 ; DAGISEL-ASM-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; DAGISEL-ASM-NEXT: s_xor_b64 s[4:5], vcc, -1 +; DAGISEL-ASM-NEXT: s_mov_b64 s[8:9], exec ; DAGISEL-ASM-NEXT: v_lshrrev_b32_e64 v0, 6, s32 -; DAGISEL-ASM-NEXT: s_and_saveexec_b64 s[4:5], vcc +; DAGISEL-ASM-NEXT: s_mov_b64 s[6:7], 0 +; DAGISEL-ASM-NEXT: s_cmp_lg_u64 vcc, 0 +; DAGISEL-ASM-NEXT: s_cmov_b64 exec, vcc +; DAGISEL-ASM-NEXT: s_cbranch_scc0 .LBB7_2 ; DAGISEL-ASM-NEXT: ; %bb.1: ; %then ; DAGISEL-ASM-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; DAGISEL-ASM-NEXT: ; %bb.2: ; %finallyendcf.split -; DAGISEL-ASM-NEXT: s_or_b64 exec, exec, s[4:5] -; DAGISEL-ASM-NEXT: s_xor_b64 s[6:7], vcc, -1 -; DAGISEL-ASM-NEXT: s_mov_b64 s[4:5], 0 -; DAGISEL-ASM-NEXT: s_mov_b64 s[8:9], src_private_base -; DAGISEL-ASM-NEXT: v_mov_b32_e32 v2, 7 -; DAGISEL-ASM-NEXT: .LBB7_3: ; %finally +; DAGISEL-ASM-NEXT: s_or_b64 exec, exec, s[8:9] +; DAGISEL-ASM-NEXT: .LBB7_2: ; %finally ; DAGISEL-ASM-NEXT: ; =>This Inner Loop Header: Depth=1 -; DAGISEL-ASM-NEXT: s_and_b64 s[10:11], exec, s[6:7] -; DAGISEL-ASM-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] +; DAGISEL-ASM-NEXT: s_and_b64 s[8:9], exec, s[4:5] +; DAGISEL-ASM-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] +; DAGISEL-ASM-NEXT: s_mov_b64 s[8:9], src_private_base ; DAGISEL-ASM-NEXT: v_mov_b32_e32 v1, s9 +; DAGISEL-ASM-NEXT: v_mov_b32_e32 v2, 7 +; DAGISEL-ASM-NEXT: s_andn2_b64 s[8:9], exec, s[6:7] ; DAGISEL-ASM-NEXT: flat_store_dword v[0:1], v2 ; DAGISEL-ASM-NEXT: s_waitcnt vmcnt(0) -; DAGISEL-ASM-NEXT: s_andn2_b64 exec, exec, s[4:5] -; DAGISEL-ASM-NEXT: s_cbranch_execnz .LBB7_3 -; DAGISEL-ASM-NEXT: ; %bb.4: ; %end -; DAGISEL-ASM-NEXT: s_or_b64 exec, exec, s[4:5] +; DAGISEL-ASM-NEXT: s_cselect_b64 exec, s[8:9], s[6:7] +; DAGISEL-ASM-NEXT: s_cbranch_scc1 .LBB7_2 +; DAGISEL-ASM-NEXT: ; %bb.3: ; %end ; DAGISEL-ASM-NEXT: s_waitcnt lgkmcnt(0) ; DAGISEL-ASM-NEXT: s_setpc_b64 s[30:31] ; @@ -237,28 +239,30 @@ define void @recursive_phis(i1 %cond, ptr addrspace(5) %ptr) { ; GISEL-ASM-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GISEL-ASM-NEXT: v_and_b32_e32 v0, 1, v0 ; GISEL-ASM-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GISEL-ASM-NEXT: s_lshr_b32 s6, s32, 6 ; GISEL-ASM-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GISEL-ASM-NEXT: v_mov_b32_e32 v0, s6 -; GISEL-ASM-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GISEL-ASM-NEXT: s_lshr_b32 s10, s32, 6 +; GISEL-ASM-NEXT: s_mov_b64 s[8:9], exec +; GISEL-ASM-NEXT: s_mov_b64 s[6:7], 0 +; GISEL-ASM-NEXT: v_mov_b32_e32 v0, s10 +; GISEL-ASM-NEXT: s_cmp_lg_u64 vcc, 0 +; GISEL-ASM-NEXT: s_cmov_b64 exec, vcc +; GISEL-ASM-NEXT: s_cbranch_scc0 .LBB7_2 ; GISEL-ASM-NEXT: ; %bb.1: ; %then ; GISEL-ASM-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GISEL-ASM-NEXT: ; %bb.2: ; %finallyendcf.split -; GISEL-ASM-NEXT: s_or_b64 exec, exec, s[6:7] -; GISEL-ASM-NEXT: s_mov_b64 s[8:9], src_private_base -; GISEL-ASM-NEXT: s_mov_b64 s[6:7], 0 -; GISEL-ASM-NEXT: v_mov_b32_e32 v1, s9 -; GISEL-ASM-NEXT: v_mov_b32_e32 v2, 7 -; GISEL-ASM-NEXT: .LBB7_3: ; %finally +; GISEL-ASM-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-ASM-NEXT: .LBB7_2: ; %finally ; GISEL-ASM-NEXT: ; =>This Inner Loop Header: Depth=1 ; GISEL-ASM-NEXT: s_and_b64 s[8:9], exec, s[4:5] ; GISEL-ASM-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] +; GISEL-ASM-NEXT: s_mov_b64 s[8:9], src_private_base +; GISEL-ASM-NEXT: v_mov_b32_e32 v1, s9 +; GISEL-ASM-NEXT: v_mov_b32_e32 v2, 7 +; GISEL-ASM-NEXT: s_andn2_b64 s[8:9], exec, s[6:7] ; GISEL-ASM-NEXT: flat_store_dword v[0:1], v2 ; GISEL-ASM-NEXT: s_waitcnt vmcnt(0) -; GISEL-ASM-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GISEL-ASM-NEXT: s_cbranch_execnz .LBB7_3 -; GISEL-ASM-NEXT: ; %bb.4: ; %end -; GISEL-ASM-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-ASM-NEXT: s_cselect_b64 exec, s[8:9], s[6:7] +; GISEL-ASM-NEXT: s_cbranch_scc1 .LBB7_2 +; GISEL-ASM-NEXT: ; %bb.3: ; %end ; GISEL-ASM-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-ASM-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll index 75f5eda608e80a..23c4ab59d14f4d 
100644 --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -10,19 +10,23 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: simple_nested_if: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 1, v0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB0_3 +; GCN-NEXT: s_mov_b64 s[4:5], exec +; GCN-NEXT: s_and_b64 s[0:1], vcc, -1 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB0_4 ; GCN-NEXT: ; %bb.1: ; %bb.outer.then ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0 +; GCN-NEXT: s_mov_b64 s[6:7], exec ; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0 +; GCN-NEXT: s_and_b64 s[8:9], vcc, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dword v2, v[1:2], s[0:3], 0 addr64 -; GCN-NEXT: s_and_b64 exec, exec, vcc -; GCN-NEXT: s_cbranch_execz .LBB0_3 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB0_3 ; GCN-NEXT: ; %bb.2: ; %bb.inner.then ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s1 @@ -32,8 +36,10 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) { ; GCN-NEXT: s_mov_b32 s1, s2 ; GCN-NEXT: v_mov_b32_e32 v2, 1 ; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:4 -; GCN-NEXT: .LBB0_3: ; %bb.outer.end +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: .LBB0_3: ; %Flow ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: .LBB0_4: ; %bb.outer.end ; GCN-NEXT: v_mov_b32_e32 v0, 3 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b32 m0, -1 @@ -60,25 +66,26 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 ; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b32 s0, 1 
-; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v1, s0 -; GCN-O0-NEXT: s_mov_b64 s[0:1], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3 +; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[0:1], v1, s0 +; GCN-O0-NEXT: s_mov_b64 s[2:3], exec +; GCN-O0-NEXT: v_writelane_b32 v0, s2, 2 +; GCN-O0-NEXT: v_writelane_b32 v0, s3, 3 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] -; GCN-O0-NEXT: s_cbranch_execz .LBB0_4 -; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then +; GCN-O0-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GCN-O0-NEXT: s_cmov_b64 exec, s[0:1] +; GCN-O0-NEXT: s_cbranch_scc1 .LBB0_1 +; GCN-O0-NEXT: s_branch .LBB0_4 +; GCN-O0-NEXT: .LBB0_1: ; %bb.outer.then ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(1) ; GCN-O0-NEXT: v_readlane_b32 s4, v0, 0 ; GCN-O0-NEXT: v_readlane_b32 s5, v0, 1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s0, 0 ; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 @@ -93,58 +100,57 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: v_lshl_b64 v[3:4], v[2:3], s0 ; GCN-O0-NEXT: v_mov_b32_e32 v2, 0 ; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[4:7], 0 addr64 -; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[2:3], v1, s0 -; GCN-O0-NEXT: s_mov_b64 s[0:1], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 4 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 5 +; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[0:1], v1, s0 +; GCN-O0-NEXT: 
s_mov_b64 s[2:3], exec +; GCN-O0-NEXT: v_writelane_b32 v0, s2, 4 +; GCN-O0-NEXT: v_writelane_b32 v0, s3, 5 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] -; GCN-O0-NEXT: s_cbranch_execz .LBB0_3 -; GCN-O0-NEXT: ; %bb.2: ; %bb.inner.then +; GCN-O0-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GCN-O0-NEXT: s_cmov_b64 exec, s[0:1] +; GCN-O0-NEXT: s_cbranch_scc1 .LBB0_2 +; GCN-O0-NEXT: s_branch .LBB0_3 +; GCN-O0-NEXT: .LBB0_2: ; %bb.inner.then +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_readlane_b32 s0, v0, 4 +; GCN-O0-NEXT: v_readlane_b32 s1, v0, 5 +; GCN-O0-NEXT: v_readlane_b32 s4, v0, 0 +; GCN-O0-NEXT: v_readlane_b32 s5, v0, 1 ; GCN-O0-NEXT: v_mov_b32_e32 v0, 1 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0 ; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1 ; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GCN-O0-NEXT: v_mov_b32_e32 v2, v3 ; GCN-O0-NEXT: s_mov_b32 s2, 2 ; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[1:2], s2 -; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 -; GCN-O0-NEXT: s_mov_b32 s4, 0 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 -; GCN-O0-NEXT: s_mov_b32 s5, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] -; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 
+; GCN-O0-NEXT: s_mov_b32 s6, 0xf000 +; GCN-O0-NEXT: s_mov_b32 s2, 0 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[2:3] +; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 +; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: .LBB0_3: ; %Flow ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 4 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 5 +; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2 +; GCN-O0-NEXT: v_readlane_b32 s1, v0, 3 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: .LBB0_4: ; %bb.outer.end ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 3 -; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: v_mov_b32_e32 v2, 3 ; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 ; GCN-O0-NEXT: s_mov_b32 m0, -1 @@ -177,36 +183,40 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a ; GCN-LABEL: uncollapsable_nested_if: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 1, v0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB1_4 +; GCN-NEXT: s_mov_b64 s[4:5], exec +; GCN-NEXT: s_and_b64 s[0:1], vcc, -1 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB1_4 ; GCN-NEXT: ; %bb.1: ; %bb.outer.then ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GCN-NEXT: s_mov_b64 s[6:7], exec ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; 
GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NEXT: v_add_i32_e32 v1, vcc, s0, v3 ; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0 +; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_and_b64 s[8:9], vcc, -1 ; GCN-NEXT: buffer_store_dword v4, v[3:4], s[0:3], 0 addr64 -; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GCN-NEXT: s_cbranch_execz .LBB1_3 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB1_3 ; GCN-NEXT: ; %bb.2: ; %bb.inner.then ; GCN-NEXT: s_mov_b32 s0, s2 ; GCN-NEXT: s_mov_b32 s1, s2 ; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:4 -; GCN-NEXT: .LBB1_3: ; %bb.inner.end ; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: .LBB1_3: ; %bb.inner.end ; GCN-NEXT: s_mov_b32 s0, s2 ; GCN-NEXT: s_mov_b32 s1, s2 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, 2 ; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:8 -; GCN-NEXT: .LBB1_4: ; %Flow ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: .LBB1_4: ; %bb.outer.end ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, 3 ; GCN-NEXT: v_mov_b32_e32 v1, 0 @@ -234,25 +244,26 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a ; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 ; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b32 s0, 1 -; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v1, s0 -; GCN-O0-NEXT: s_mov_b64 s[0:1], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3 +; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[0:1], v1, s0 +; GCN-O0-NEXT: s_mov_b64 s[2:3], exec +; GCN-O0-NEXT: v_writelane_b32 v0, s2, 2 +; GCN-O0-NEXT: v_writelane_b32 v0, s3, 3 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] 
-; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] -; GCN-O0-NEXT: s_cbranch_execz .LBB1_3 -; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then +; GCN-O0-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GCN-O0-NEXT: s_cmov_b64 exec, s[0:1] +; GCN-O0-NEXT: s_cbranch_scc1 .LBB1_1 +; GCN-O0-NEXT: s_branch .LBB1_3 +; GCN-O0-NEXT: .LBB1_1: ; %bb.outer.then ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(1) ; GCN-O0-NEXT: v_readlane_b32 s4, v0, 0 ; GCN-O0-NEXT: v_readlane_b32 s5, v0, 1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s0, 0 ; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 @@ -267,80 +278,75 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a ; GCN-O0-NEXT: v_lshl_b64 v[3:4], v[2:3], s0 ; GCN-O0-NEXT: v_mov_b32_e32 v2, 0 ; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[4:7], 0 addr64 -; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[2:3], v1, s0 -; GCN-O0-NEXT: s_mov_b64 s[0:1], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 4 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 5 +; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[0:1], v1, s0 +; GCN-O0-NEXT: s_mov_b64 s[2:3], exec +; GCN-O0-NEXT: v_writelane_b32 v0, s2, 4 +; GCN-O0-NEXT: v_writelane_b32 v0, s3, 5 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] -; GCN-O0-NEXT: s_cbranch_execz .LBB1_4 -; GCN-O0-NEXT: ; %bb.2: ; %bb.inner.then +; GCN-O0-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GCN-O0-NEXT: 
s_cmov_b64 exec, s[0:1] +; GCN-O0-NEXT: s_cbranch_scc1 .LBB1_2 +; GCN-O0-NEXT: s_branch .LBB1_4 +; GCN-O0-NEXT: .LBB1_2: ; %bb.inner.then +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_readlane_b32 s0, v0, 4 +; GCN-O0-NEXT: v_readlane_b32 s1, v0, 5 +; GCN-O0-NEXT: v_readlane_b32 s4, v0, 0 +; GCN-O0-NEXT: v_readlane_b32 s5, v0, 1 ; GCN-O0-NEXT: v_mov_b32_e32 v0, 1 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0 ; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1 ; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GCN-O0-NEXT: v_mov_b32_e32 v2, v3 ; GCN-O0-NEXT: s_mov_b32 s2, 2 ; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[1:2], s2 -; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 -; GCN-O0-NEXT: s_mov_b32 s4, 0 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 -; GCN-O0-NEXT: s_mov_b32 s5, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] -; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-O0-NEXT: s_mov_b32 s6, 0xf000 +; GCN-O0-NEXT: s_mov_b32 s2, 0 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[2:3] +; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 +; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: s_branch .LBB1_4 ; GCN-O0-NEXT: .LBB1_3: ; %Flow -; GCN-O0-NEXT: 
s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 3 -; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: s_branch .LBB1_5 ; GCN-O0-NEXT: .LBB1_4: ; %bb.inner.end +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s2, v0, 4 -; GCN-O0-NEXT: v_readlane_b32 s3, v0, 5 -; GCN-O0-NEXT: s_or_b64 exec, exec, s[2:3] -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2 +; GCN-O0-NEXT: v_readlane_b32 s1, v0, 3 +; GCN-O0-NEXT: v_readlane_b32 s4, v0, 0 +; GCN-O0-NEXT: v_readlane_b32 s5, v0, 1 ; GCN-O0-NEXT: v_mov_b32_e32 v0, 2 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0 ; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1 ; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GCN-O0-NEXT: v_mov_b32_e32 v2, v3 ; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[1:2], v0 -; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 -; GCN-O0-NEXT: s_mov_b32 s4, 0 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 -; GCN-O0-NEXT: s_mov_b32 s5, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] -; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-O0-NEXT: s_mov_b32 s6, 0xf000 +; GCN-O0-NEXT: s_mov_b32 s2, 0 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def 
$sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[2:3] +; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 +; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: s_branch .LBB1_3 ; GCN-O0-NEXT: .LBB1_5: ; %bb.outer.end ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: v_mov_b32_e32 v2, 3 @@ -381,45 +387,52 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: nested_if_if_else: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 1, v0 +; GCN-NEXT: s_mov_b64 s[4:5], exec ; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 -; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 1, v0 +; GCN-NEXT: s_and_b64 s[6:7], vcc, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dword v2, v[1:2], s[0:3], 0 addr64 -; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GCN-NEXT: s_cbranch_execz .LBB2_5 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB2_6 ; GCN-NEXT: ; %bb.1: ; %bb.outer.then -; GCN-NEXT: v_mov_b32_e32 v4, s1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s0, v1 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0 -; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GCN-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GCN-NEXT: s_cbranch_execz .LBB2_3 +; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: s_xor_b64 s[2:3], vcc, exec +; GCN-NEXT: v_add_i32_e64 v0, s[0:1], s0, v1 +; GCN-NEXT: s_and_b64 s[6:7], vcc, -1 +; GCN-NEXT: v_addc_u32_e64 v1, s[0:1], 0, v3, s[0:1] +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB2_3 ; GCN-NEXT: ; %bb.2: ; %bb.else -; GCN-NEXT: s_mov_b32 s6, 0 -; 
GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: v_mov_b32_e32 v0, 2 -; GCN-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64 offset:8 -; GCN-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GCN-NEXT: s_mov_b32 s10, 0 +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s8, s10 +; GCN-NEXT: s_mov_b32 s9, s10 +; GCN-NEXT: v_mov_b32_e32 v3, 2 +; GCN-NEXT: buffer_store_dword v3, v[0:1], s[8:11], 0 addr64 offset:8 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN-NEXT: s_or_b64 exec, exec, s[2:3] ; GCN-NEXT: .LBB2_3: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GCN-NEXT: s_cbranch_execz .LBB2_5 +; GCN-NEXT: s_xor_b64 s[0:1], s[2:3], exec +; GCN-NEXT: s_and_b64 s[6:7], s[2:3], -1 +; GCN-NEXT: s_cmov_b64 exec, s[2:3] +; GCN-NEXT: s_cbranch_scc0 .LBB2_5 ; GCN-NEXT: ; %bb.4: ; %bb.then -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, 1 -; GCN-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64 offset:4 -; GCN-NEXT: .LBB2_5: ; %bb.outer.end -; GCN-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN-NEXT: s_mov_b32 s10, 0 +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s8, s10 +; GCN-NEXT: s_mov_b32 s9, s10 ; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v3, 1 +; GCN-NEXT: buffer_store_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 +; GCN-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN-NEXT: .LBB2_5: ; %Flow7 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: .LBB2_6: ; %bb.outer.end ; GCN-NEXT: v_mov_b32_e32 v0, 3 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_write_b32 v2, v0 @@ -435,9 +448,9 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 ; GCN-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane ; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-O0-NEXT: 
s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[0:1] @@ -463,120 +476,120 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: v_mov_b32_e32 v2, 0 ; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[0:3], 0 addr64 ; GCN-O0-NEXT: s_mov_b32 s0, 1 -; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v1, s0 -; GCN-O0-NEXT: s_mov_b64 s[0:1], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3 -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[0:1], v1, s0 +; GCN-O0-NEXT: s_mov_b64 s[2:3], exec +; GCN-O0-NEXT: v_writelane_b32 v0, s2, 2 +; GCN-O0-NEXT: v_writelane_b32 v0, s3, 3 +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] -; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] -; GCN-O0-NEXT: s_cbranch_execz .LBB2_6 -; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] +; GCN-O0-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GCN-O0-NEXT: s_cmov_b64 exec, s[0:1] +; GCN-O0-NEXT: s_cbranch_scc1 .LBB2_1 +; GCN-O0-NEXT: s_branch .LBB2_6 +; GCN-O0-NEXT: .LBB2_1: ; %bb.outer.then +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b32 s0, 2 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[0:1], v1, s0 -; GCN-O0-NEXT: 
s_mov_b64 s[2:3], exec -; GCN-O0-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] -; GCN-O0-NEXT: s_xor_b64 s[2:3], s[0:1], s[2:3] +; GCN-O0-NEXT: s_xor_b64 s[2:3], s[0:1], exec ; GCN-O0-NEXT: v_writelane_b32 v0, s2, 4 ; GCN-O0-NEXT: v_writelane_b32 v0, s3, 5 -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] -; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] -; GCN-O0-NEXT: s_cbranch_execz .LBB2_2 -; GCN-O0-NEXT: s_branch .LBB2_4 +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] +; GCN-O0-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GCN-O0-NEXT: s_cmov_b64 exec, s[0:1] +; GCN-O0-NEXT: s_cbranch_scc1 .LBB2_4 ; GCN-O0-NEXT: .LBB2_2: ; %Flow -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s0, v0, 4 ; GCN-O0-NEXT: v_readlane_b32 s1, v0, 5 -; GCN-O0-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] -; GCN-O0-NEXT: s_and_b64 s[0:1], exec, s[0:1] -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 6 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 7 -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-O0-NEXT: s_xor_b64 s[2:3], s[0:1], exec +; GCN-O0-NEXT: v_writelane_b32 v0, s2, 6 +; GCN-O0-NEXT: v_writelane_b32 v0, s3, 7 +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] -; GCN-O0-NEXT: s_xor_b64 exec, exec, s[0:1] -; GCN-O0-NEXT: s_cbranch_execz .LBB2_5 -; GCN-O0-NEXT: ; %bb.3: ; %bb.then -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] +; GCN-O0-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GCN-O0-NEXT: s_cmov_b64 exec, s[0:1] +; GCN-O0-NEXT: s_cbranch_scc1 
.LBB2_3 +; GCN-O0-NEXT: s_branch .LBB2_5 +; GCN-O0-NEXT: .LBB2_3: ; %bb.then +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_readlane_b32 s0, v0, 6 +; GCN-O0-NEXT: v_readlane_b32 s1, v0, 7 +; GCN-O0-NEXT: v_readlane_b32 s4, v0, 0 +; GCN-O0-NEXT: v_readlane_b32 s5, v0, 1 ; GCN-O0-NEXT: v_mov_b32_e32 v0, 1 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0 ; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1 ; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GCN-O0-NEXT: v_mov_b32_e32 v2, v3 ; GCN-O0-NEXT: s_mov_b32 s2, 2 ; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[1:2], s2 -; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 -; GCN-O0-NEXT: s_mov_b32 s4, 0 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 -; GCN-O0-NEXT: s_mov_b32 s5, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] -; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-O0-NEXT: s_mov_b32 s6, 0xf000 +; GCN-O0-NEXT: s_mov_b32 s2, 0 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[2:3] +; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 +; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: s_branch .LBB2_5 ; GCN-O0-NEXT: .LBB2_4: ; %bb.else -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 
+; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_readlane_b32 s0, v0, 4 +; GCN-O0-NEXT: v_readlane_b32 s1, v0, 5 +; GCN-O0-NEXT: v_readlane_b32 s4, v0, 0 +; GCN-O0-NEXT: v_readlane_b32 s5, v0, 1 ; GCN-O0-NEXT: v_mov_b32_e32 v0, 2 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0 ; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1 ; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GCN-O0-NEXT: v_mov_b32_e32 v2, v3 ; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[1:2], v0 -; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 -; GCN-O0-NEXT: s_mov_b32 s4, 0 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 -; GCN-O0-NEXT: s_mov_b32 s5, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] -; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-O0-NEXT: s_mov_b32 s6, 0xf000 +; GCN-O0-NEXT: s_mov_b32 s2, 0 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[2:3] +; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 +; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: s_branch .LBB2_2 ; GCN-O0-NEXT: .LBB2_5: ; %Flow1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: 
buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 6 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 7 +; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2 +; GCN-O0-NEXT: v_readlane_b32 s1, v0, 3 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: .LBB2_6: ; %bb.outer.end -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 3 -; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: v_mov_b32_e32 v2, 3 ; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 ; GCN-O0-NEXT: s_mov_b32 m0, -1 @@ -624,48 +637,54 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-NEXT: v_add_i32_e32 v1, vcc, s0, v3 ; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 2, v0 +; GCN-NEXT: s_xor_b64 s[4:5], vcc, exec +; GCN-NEXT: s_and_b64 s[6:7], vcc, -1 ; GCN-NEXT: buffer_store_dword v4, v[3:4], s[0:3], 0 addr64 -; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[0:1] -; GCN-NEXT: s_cbranch_execz .LBB3_4 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB3_4 ; GCN-NEXT: ; %bb.1: ; %bb.outer.else +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b64 s[6:7], exec ; GCN-NEXT: s_mov_b32 s0, s2 ; GCN-NEXT: s_mov_b32 s1, s2 ; GCN-NEXT: v_mov_b32_e32 v3, 3 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_and_b64 s[8:9], vcc, -1 ; GCN-NEXT: buffer_store_dword v3, v[1:2], s[0:3], 0 addr64 offset:12 -; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GCN-NEXT: s_cbranch_execz .LBB3_3 +; GCN-NEXT: s_cmov_b64 exec, vcc 
+; GCN-NEXT: s_cbranch_scc0 .LBB3_3 ; GCN-NEXT: ; %bb.2: ; %bb.inner.then2 -; GCN-NEXT: s_mov_b32 s10, 0 -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: s_mov_b32 s8, s10 -; GCN-NEXT: s_mov_b32 s9, s10 +; GCN-NEXT: s_mov_b32 s0, s2 +; GCN-NEXT: s_mov_b32 s1, s2 ; GCN-NEXT: v_mov_b32_e32 v0, 4 -; GCN-NEXT: buffer_store_dword v0, v[1:2], s[8:11], 0 addr64 offset:16 +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:16 +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: .LBB3_3: ; %Flow -; GCN-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: .LBB3_4: ; %Flow2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_8 +; GCN-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GCN-NEXT: s_and_b64 s[0:1], s[4:5], -1 +; GCN-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-NEXT: s_cbranch_scc0 .LBB3_8 ; GCN-NEXT: ; %bb.5: ; %bb.outer.then +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 +; GCN-NEXT: s_mov_b64 s[4:5], exec ; GCN-NEXT: s_mov_b32 s0, s2 ; GCN-NEXT: s_mov_b32 s1, s2 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v3, 1 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 +; GCN-NEXT: s_and_b64 s[8:9], vcc, -1 ; GCN-NEXT: buffer_store_dword v3, v[1:2], s[0:3], 0 addr64 offset:4 -; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GCN-NEXT: s_cbranch_execz .LBB3_7 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB3_7 ; GCN-NEXT: ; %bb.6: ; %bb.inner.then ; GCN-NEXT: v_mov_b32_e32 v0, 2 ; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:8 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: .LBB3_7: ; %Flow1 ; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: .LBB3_8: ; %bb.outer.end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, 3 ; GCN-NEXT: v_mov_b32_e32 v1, 0 @@ -719,17 +738,15 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture 
%arg) { ; GCN-O0-NEXT: v_mov_b32_e32 v2, 0 ; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[4:7], 0 addr64 ; GCN-O0-NEXT: v_cmp_lt_u32_e64 s[0:1], v1, s0 -; GCN-O0-NEXT: s_mov_b64 s[2:3], exec -; GCN-O0-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] -; GCN-O0-NEXT: s_xor_b64 s[2:3], s[0:1], s[2:3] +; GCN-O0-NEXT: s_xor_b64 s[2:3], s[0:1], exec ; GCN-O0-NEXT: v_writelane_b32 v0, s2, 0 ; GCN-O0-NEXT: v_writelane_b32 v0, s3, 1 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] -; GCN-O0-NEXT: s_cbranch_execz .LBB3_1 -; GCN-O0-NEXT: s_branch .LBB3_4 +; GCN-O0-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GCN-O0-NEXT: s_cmov_b64 exec, s[0:1] +; GCN-O0-NEXT: s_cbranch_scc1 .LBB3_4 ; GCN-O0-NEXT: .LBB3_1: ; %Flow2 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) @@ -738,16 +755,17 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0 ; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] -; GCN-O0-NEXT: s_and_b64 s[0:1], exec, s[0:1] -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3 +; GCN-O0-NEXT: s_xor_b64 s[2:3], s[0:1], exec +; GCN-O0-NEXT: v_writelane_b32 v0, s2, 2 +; GCN-O0-NEXT: v_writelane_b32 v0, s3, 3 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: s_xor_b64 exec, exec, s[0:1] -; GCN-O0-NEXT: s_cbranch_execz .LBB3_8 -; GCN-O0-NEXT: ; %bb.2: ; %bb.outer.then +; GCN-O0-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GCN-O0-NEXT: s_cmov_b64 exec, s[0:1] +; GCN-O0-NEXT: s_cbranch_scc1 .LBB3_2 +; GCN-O0-NEXT: s_branch .LBB3_8 +; GCN-O0-NEXT: .LBB3_2: ; %bb.outer.then ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: 
s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload @@ -767,32 +785,39 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[0:3], 0 addr64 offset:4 ; GCN-O0-NEXT: s_mov_b32 s0, 2 -; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v1, s0 -; GCN-O0-NEXT: s_mov_b64 s[0:1], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 4 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 5 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, s0 +; GCN-O0-NEXT: s_mov_b64 s[2:3], exec +; GCN-O0-NEXT: v_writelane_b32 v0, s2, 4 +; GCN-O0-NEXT: v_writelane_b32 v0, s3, 5 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] -; GCN-O0-NEXT: s_cbranch_execz .LBB3_7 -; GCN-O0-NEXT: ; %bb.3: ; %bb.inner.then +; GCN-O0-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GCN-O0-NEXT: s_cmov_b64 exec, s[0:1] +; GCN-O0-NEXT: s_cbranch_scc1 .LBB3_3 +; GCN-O0-NEXT: s_branch .LBB3_7 +; GCN-O0-NEXT: .LBB3_3: ; %bb.inner.then ; GCN-O0-NEXT: s_waitcnt expcnt(1) ; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b32 s0, 0xf000 -; GCN-O0-NEXT: s_mov_b32 s2, 0 -; GCN-O0-NEXT: s_mov_b32 s4, s2 -; GCN-O0-NEXT: s_mov_b32 s5, s0 -; GCN-O0-NEXT: s_mov_b32 s0, s2 -; GCN-O0-NEXT: s_mov_b32 s1, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v0, 2 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: 
s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:8 +; GCN-O0-NEXT: v_readlane_b32 s0, v0, 4 +; GCN-O0-NEXT: v_readlane_b32 s1, v0, 5 +; GCN-O0-NEXT: s_mov_b32 s4, 0xf000 +; GCN-O0-NEXT: s_mov_b32 s6, 0 +; GCN-O0-NEXT: s_mov_b32 s2, s6 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: s_mov_b32 s4, s6 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[2:3] +; GCN-O0-NEXT: v_mov_b32_e32 v0, 2 +; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 offset:8 +; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: s_branch .LBB3_7 ; GCN-O0-NEXT: .LBB3_4: ; %bb.outer.else ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 @@ -813,33 +838,21 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: v_mov_b32_e32 v2, 3 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[4:7], 0 addr64 offset:12 -; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v1, s0 -; GCN-O0-NEXT: s_mov_b64 s[0:1], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 6 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 7 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, s0 +; GCN-O0-NEXT: s_mov_b64 s[2:3], exec +; GCN-O0-NEXT: v_writelane_b32 v0, s2, 6 +; GCN-O0-NEXT: v_writelane_b32 v0, s3, 7 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] -; GCN-O0-NEXT: s_cbranch_execz .LBB3_6 -; GCN-O0-NEXT: ; %bb.5: ; %bb.inner.then2 +; GCN-O0-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GCN-O0-NEXT: s_cmov_b64 exec, s[0:1] +; GCN-O0-NEXT: s_cbranch_scc1 .LBB3_5 +; GCN-O0-NEXT: s_branch .LBB3_6 +; GCN-O0-NEXT: .LBB3_5: ; %bb.inner.then2 ; GCN-O0-NEXT: s_waitcnt expcnt(1) ; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte 
Folded Reload ; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b32 s0, 0xf000 -; GCN-O0-NEXT: s_mov_b32 s2, 0 -; GCN-O0-NEXT: s_mov_b32 s4, s2 -; GCN-O0-NEXT: s_mov_b32 s5, s0 -; GCN-O0-NEXT: s_mov_b32 s0, s2 -; GCN-O0-NEXT: s_mov_b32 s1, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] -; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v0, 4 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:16 -; GCN-O0-NEXT: .LBB3_6: ; %Flow ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload @@ -847,18 +860,28 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s0, v0, 6 ; GCN-O0-NEXT: v_readlane_b32 s1, v0, 7 +; GCN-O0-NEXT: s_mov_b32 s4, 0xf000 +; GCN-O0-NEXT: s_mov_b32 s6, 0 +; GCN-O0-NEXT: s_mov_b32 s2, s6 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: s_mov_b32 s4, s6 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[2:3] +; GCN-O0-NEXT: v_mov_b32_e32 v0, 4 +; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 offset:16 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN-O0-NEXT: s_branch .LBB3_1 -; GCN-O0-NEXT: .LBB3_7: ; %Flow1 +; GCN-O0-NEXT: .LBB3_6: ; %Flow ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 4 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 5 +; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0 +; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1 ; 
GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN-O0-NEXT: .LBB3_8: ; %bb.outer.end +; GCN-O0-NEXT: s_branch .LBB3_1 +; GCN-O0-NEXT: .LBB3_7: ; %Flow1 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload @@ -867,6 +890,11 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2 ; GCN-O0-NEXT: v_readlane_b32 s1, v0, 3 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN-O0-NEXT: .LBB3_8: ; %bb.outer.end +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: v_mov_b32_e32 v2, 3 ; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 ; GCN-O0-NEXT: s_mov_b32 m0, -1 @@ -911,8 +939,10 @@ define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %a ; GCN-LABEL: s_endpgm_unsafe_barrier: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 1, v0 -; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GCN-NEXT: s_cbranch_execz .LBB4_2 +; GCN-NEXT: s_mov_b64 s[0:1], exec +; GCN-NEXT: s_and_b64 s[4:5], vcc, -1 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB4_2 ; GCN-NEXT: ; %bb.1: ; %bb.then ; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 @@ -921,8 +951,8 @@ define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %a ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dword v1, v[0:1], s[4:7], 0 addr64 -; GCN-NEXT: .LBB4_2: ; %bb.end ; GCN-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN-NEXT: .LBB4_2: ; %bb.end ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_barrier ; GCN-NEXT: s_endpgm @@ -937,9 +967,9 @@ define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %a ; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 ; GCN-O0-NEXT: ; implicit-def: 
$vgpr1 : SGPR spill to VGPR lane ; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-O0-NEXT: v_writelane_b32 v0, s0, 0 @@ -947,48 +977,48 @@ define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %a ; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 ; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b32 s0, 1 -; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v1, s0 -; GCN-O0-NEXT: s_mov_b64 s[0:1], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3 -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[0:1], v1, s0 +; GCN-O0-NEXT: s_mov_b64 s[2:3], exec +; GCN-O0-NEXT: v_writelane_b32 v0, s2, 2 +; GCN-O0-NEXT: v_writelane_b32 v0, s3, 3 +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] -; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] -; GCN-O0-NEXT: s_cbranch_execz .LBB4_2 -; GCN-O0-NEXT: ; %bb.1: ; %bb.then -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0 -; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1 +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] +; GCN-O0-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GCN-O0-NEXT: s_cmov_b64 exec, s[0:1] +; GCN-O0-NEXT: s_cbranch_scc1 .LBB4_1 +; GCN-O0-NEXT: s_branch .LBB4_2 +; GCN-O0-NEXT: .LBB4_1: ; %bb.then ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword 
v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 -; GCN-O0-NEXT: s_mov_b32 s4, 0 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 -; GCN-O0-NEXT: s_mov_b32 s5, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s0, v1, 2 +; GCN-O0-NEXT: v_readlane_b32 s1, v1, 3 +; GCN-O0-NEXT: v_readlane_b32 s4, v1, 0 +; GCN-O0-NEXT: v_readlane_b32 s5, v1, 1 +; GCN-O0-NEXT: s_mov_b32 s6, 0xf000 +; GCN-O0-NEXT: s_mov_b32 s2, 0 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[2:3] ; GCN-O0-NEXT: v_ashrrev_i32_e64 v2, 31, v0 ; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GCN-O0-NEXT: v_mov_b32_e32 v1, v2 -; GCN-O0-NEXT: s_mov_b32 s4, 2 -; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[0:1], s4 +; GCN-O0-NEXT: s_mov_b32 s2, 2 +; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[0:1], s2 ; GCN-O0-NEXT: v_mov_b32_e32 v0, 0 -; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 +; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: .LBB4_2: ; %bb.end -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 3 -; GCN-O0-NEXT: s_or_b64 exec, exec, 
s[0:1] ; GCN-O0-NEXT: s_barrier ; GCN-O0-NEXT: ; kill: killed $vgpr0 ; GCN-O0-NEXT: s_endpgm @@ -1020,44 +1050,48 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-NEXT: s_branch .LBB5_3 ; GCN-NEXT: .LBB5_1: ; %Flow ; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1 -; GCN-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-NEXT: s_or_b64 exec, exec, s[14:15] ; GCN-NEXT: .LBB5_2: ; %bb10 ; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1 -; GCN-NEXT: s_or_b64 exec, exec, s[14:15] ; GCN-NEXT: s_and_b64 s[6:7], exec, s[4:5] ; GCN-NEXT: s_or_b64 s[12:13], s[6:7], s[12:13] +; GCN-NEXT: s_andn2_b64 s[10:11], exec, s[12:13] ; GCN-NEXT: s_mov_b64 s[6:7], 0 -; GCN-NEXT: s_andn2_b64 exec, exec, s[12:13] -; GCN-NEXT: s_cbranch_execz .LBB5_7 +; GCN-NEXT: s_cselect_b64 exec, s[10:11], s[12:13] +; GCN-NEXT: s_cbranch_scc0 .LBB5_7 ; GCN-NEXT: .LBB5_3: ; %bb1 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: s_and_b64 s[10:11], exec, vcc ; GCN-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] -; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GCN-NEXT: s_cbranch_execnz .LBB5_3 +; GCN-NEXT: s_andn2_b64 s[10:11], exec, s[6:7] +; GCN-NEXT: s_cselect_b64 exec, s[10:11], s[6:7] +; GCN-NEXT: s_cbranch_scc1 .LBB5_3 ; GCN-NEXT: ; %bb.4: ; %bb2 ; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1 -; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: s_mov_b32 s9, s8 ; GCN-NEXT: s_mov_b32 s10, s8 ; GCN-NEXT: s_mov_b32 s11, s8 ; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: s_mov_b64 s[14:15], exec +; GCN-NEXT: s_and_b64 s[6:7], s[4:5], exec ; GCN-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NEXT: v_mov_b32_e32 v2, s10 ; GCN-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NEXT: s_and_saveexec_b64 s[14:15], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 +; GCN-NEXT: s_cmov_b64 exec, s[6:7] +; GCN-NEXT: s_cbranch_scc0 .LBB5_2 ; GCN-NEXT: ; %bb.5: ; %bb4 ; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1 ; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GCN-NEXT: s_mov_b64 s[16:17], exec ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: 
v_cmp_gt_f32_e64 s[6:7], 0, v0 ; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: s_and_b64 s[18:19], s[6:7], -1 ; GCN-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NEXT: v_mov_b32_e32 v2, s10 ; GCN-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NEXT: s_and_saveexec_b64 s[10:11], s[6:7] -; GCN-NEXT: s_cbranch_execz .LBB5_1 +; GCN-NEXT: s_cmov_b64 exec, s[6:7] +; GCN-NEXT: s_cbranch_scc0 .LBB5_1 ; GCN-NEXT: ; %bb.6: ; %bb8 ; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1 ; GCN-NEXT: s_mov_b32 s9, s8 @@ -1065,9 +1099,9 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NEXT: v_mov_b32_e32 v2, s10 ; GCN-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NEXT: s_or_b64 exec, exec, s[16:17] ; GCN-NEXT: s_branch .LBB5_1 ; GCN-NEXT: .LBB5_7: ; %bb12 -; GCN-NEXT: s_or_b64 exec, exec, s[12:13] ; GCN-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen @@ -1087,10 +1121,10 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] ; GCN-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane ; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(1) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_mov_b64 exec, s[16:17] ; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 s[4:5], 0 ; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] @@ -1099,61 +1133,56 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-O0-NEXT: v_writelane_b32 v0, s7, 1 ; GCN-O0-NEXT: v_writelane_b32 v0, s4, 2 ; GCN-O0-NEXT: v_writelane_b32 v0, s5, 3 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; 
GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_mov_b64 exec, s[16:17] ; GCN-O0-NEXT: .LBB5_1: ; %bb1 ; GCN-O0-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: s_mov_b64 exec, s[16:17] +; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(1) ; GCN-O0-NEXT: v_readlane_b32 s8, v0, 2 ; GCN-O0-NEXT: v_readlane_b32 s9, v0, 3 -; GCN-O0-NEXT: v_readlane_b32 s6, v0, 0 -; GCN-O0-NEXT: v_readlane_b32 s7, v0, 1 -; GCN-O0-NEXT: v_writelane_b32 v0, s6, 4 -; GCN-O0-NEXT: v_writelane_b32 v0, s7, 5 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b32 s4, 0x207 +; GCN-O0-NEXT: v_readlane_b32 s4, v0, 0 +; GCN-O0-NEXT: v_readlane_b32 s5, v0, 1 +; GCN-O0-NEXT: v_writelane_b32 v0, s4, 4 +; GCN-O0-NEXT: v_writelane_b32 v0, s5, 5 +; GCN-O0-NEXT: s_mov_b32 s6, 0x207 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, s4 -; GCN-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GCN-O0-NEXT: v_writelane_b32 v0, s4, 6 -; GCN-O0-NEXT: v_writelane_b32 v0, s5, 7 -; GCN-O0-NEXT: v_writelane_b32 v0, s6, 0 -; GCN-O0-NEXT: v_writelane_b32 v0, s7, 1 -; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-O0-NEXT: v_writelane_b32 v0, s6, 2 -; GCN-O0-NEXT: v_writelane_b32 v0, s7, 3 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: v_cmp_lt_i32_e64 s[6:7], v1, s6 +; GCN-O0-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GCN-O0-NEXT: v_writelane_b32 v0, s4, 0 +; GCN-O0-NEXT: v_writelane_b32 v0, s5, 1 +; GCN-O0-NEXT: s_mov_b64 s[4:5], s[6:7] +; GCN-O0-NEXT: v_writelane_b32 v0, s4, 2 +; GCN-O0-NEXT: v_writelane_b32 v0, s5, 3 +; GCN-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN-O0-NEXT: s_cbranch_execnz .LBB5_1 +; GCN-O0-NEXT: s_mov_b64 exec, s[16:17] +; GCN-O0-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GCN-O0-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GCN-O0-NEXT: s_cbranch_scc1 .LBB5_1 ; GCN-O0-NEXT: ; %bb.2: ; %bb2 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v0, 6 -; GCN-O0-NEXT: v_readlane_b32 s5, v0, 7 -; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-O0-NEXT: s_mov_b64 exec, s[16:17] ; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b32 s6, 0 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[4:5], v1, s6 -; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v1, s6 -; GCN-O0-NEXT: v_writelane_b32 v0, s4, 8 -; GCN-O0-NEXT: v_writelane_b32 v0, s5, 9 ; GCN-O0-NEXT: s_mov_b32 s4, 0 -; GCN-O0-NEXT: s_mov_b32 s8, s4 -; GCN-O0-NEXT: s_mov_b32 s9, s4 -; GCN-O0-NEXT: s_mov_b32 s10, s4 -; GCN-O0-NEXT: s_mov_b32 s11, s4 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[6:7], v1, s4 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, s4 +; GCN-O0-NEXT: v_writelane_b32 v0, s6, 6 +; GCN-O0-NEXT: v_writelane_b32 v0, s7, 7 +; GCN-O0-NEXT: s_mov_b32 s6, 0 +; GCN-O0-NEXT: s_mov_b32 s8, s6 +; GCN-O0-NEXT: s_mov_b32 s9, s6 +; GCN-O0-NEXT: s_mov_b32 s10, s6 +; GCN-O0-NEXT: s_mov_b32 s11, s6 ; GCN-O0-NEXT: v_mov_b32_e32 v1, s8 ; GCN-O0-NEXT: v_mov_b32_e32 v2, s9 ; GCN-O0-NEXT: v_mov_b32_e32 v3, s10 @@ -1163,31 +1192,32 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-O0-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[4:5], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s4, 10 -; GCN-O0-NEXT: v_writelane_b32 v0, s5, 11 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_mov_b64 s[6:7], exec +; GCN-O0-NEXT: v_writelane_b32 v0, s6, 8 +; GCN-O0-NEXT: v_writelane_b32 v0, s7, 9 +; GCN-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: s_cbranch_execz .LBB5_5 -; GCN-O0-NEXT: ; %bb.3: ; %bb4 +; GCN-O0-NEXT: s_mov_b64 exec, s[16:17] +; GCN-O0-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GCN-O0-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-O0-NEXT: s_cbranch_scc1 .LBB5_3 +; GCN-O0-NEXT: s_branch .LBB5_5 +; GCN-O0-NEXT: .LBB5_3: ; %bb4 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_mov_b64 exec, s[16:17] ; GCN-O0-NEXT: ; implicit-def: $sgpr4 ; GCN-O0-NEXT: v_mov_b32_e32 v1, s4 ; GCN-O0-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen -; GCN-O0-NEXT: s_mov_b32 s4, 0 +; GCN-O0-NEXT: s_mov_b32 s6, 0 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_cmp_lt_f32_e64 s[6:7], v1, s4 -; GCN-O0-NEXT: s_mov_b32 s8, s4 -; GCN-O0-NEXT: s_mov_b32 s9, s4 -; GCN-O0-NEXT: s_mov_b32 s10, s4 -; GCN-O0-NEXT: s_mov_b32 s11, s4 +; GCN-O0-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, s6 +; GCN-O0-NEXT: s_mov_b32 s8, s6 +; GCN-O0-NEXT: s_mov_b32 s9, s6 +; GCN-O0-NEXT: s_mov_b32 s10, s6 +; 
GCN-O0-NEXT: s_mov_b32 s11, s6 ; GCN-O0-NEXT: v_mov_b32_e32 v1, s8 ; GCN-O0-NEXT: v_mov_b32_e32 v2, s9 ; GCN-O0-NEXT: v_mov_b32_e32 v3, s10 @@ -1197,49 +1227,49 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[4:5], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s4, 12 -; GCN-O0-NEXT: v_writelane_b32 v0, s5, 13 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_mov_b64 s[6:7], exec +; GCN-O0-NEXT: v_writelane_b32 v0, s6, 10 +; GCN-O0-NEXT: v_writelane_b32 v0, s7, 11 +; GCN-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: s_cbranch_execz .LBB5_6 -; GCN-O0-NEXT: ; %bb.4: ; %bb8 +; GCN-O0-NEXT: s_mov_b64 exec, s[16:17] +; GCN-O0-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GCN-O0-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-O0-NEXT: s_cbranch_scc1 .LBB5_4 +; GCN-O0-NEXT: s_branch .LBB5_6 +; GCN-O0-NEXT: .LBB5_4: ; %bb8 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-O0-NEXT: s_mov_b32 s10, 0 -; GCN-O0-NEXT: ; implicit-def: $sgpr4 -; GCN-O0-NEXT: ; implicit-def: $sgpr5 -; GCN-O0-NEXT: ; implicit-def: $sgpr9 -; GCN-O0-NEXT: ; implicit-def: $sgpr5 -; GCN-O0-NEXT: ; implicit-def: $sgpr8 -; GCN-O0-NEXT: ; implicit-def: $sgpr5 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 -; GCN-O0-NEXT: s_mov_b32 s5, s10 -; GCN-O0-NEXT: s_mov_b32 s6, s9 -; GCN-O0-NEXT: s_mov_b32 s7, s8 +; GCN-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 -; 
GCN-O0-NEXT: v_mov_b32_e32 v2, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[16:17] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v0, 10 +; GCN-O0-NEXT: v_readlane_b32 s5, v0, 11 +; GCN-O0-NEXT: s_mov_b32 s12, 0 +; GCN-O0-NEXT: ; implicit-def: $sgpr8 +; GCN-O0-NEXT: ; implicit-def: $sgpr6 +; GCN-O0-NEXT: ; implicit-def: $sgpr7 +; GCN-O0-NEXT: ; implicit-def: $sgpr6 +; GCN-O0-NEXT: ; implicit-def: $sgpr6 +; GCN-O0-NEXT: ; implicit-def: $sgpr9 +; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 +; GCN-O0-NEXT: s_mov_b32 s9, s12 +; GCN-O0-NEXT: s_mov_b32 s10, s7 +; GCN-O0-NEXT: s_mov_b32 s11, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s11 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-O0-NEXT: s_branch .LBB5_6 ; GCN-O0-NEXT: .LBB5_5: ; %Flow2 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: s_waitcnt expcnt(1) -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v4, 10 -; GCN-O0-NEXT: v_readlane_b32 s5, v4, 11 -; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 
4-byte Folded Reload @@ -1254,114 +1284,103 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-O0-NEXT: s_branch .LBB5_7 ; GCN-O0-NEXT: .LBB5_6: ; %Flow ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: s_waitcnt expcnt(1) -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v4, 12 -; GCN-O0-NEXT: v_readlane_b32 s5, v4, 13 -; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[16:17] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v4, 8 +; GCN-O0-NEXT: v_readlane_b32 s5, v4, 9 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-O0-NEXT: s_branch .LBB5_5 ; GCN-O0-NEXT: .LBB5_7: ; %bb10 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(3) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; 
GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_mov_b64 exec, s[16:17] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s6, v0, 8 -; GCN-O0-NEXT: v_readlane_b32 s7, v0, 9 -; GCN-O0-NEXT: s_mov_b64 s[4:5], -1 -; GCN-O0-NEXT: v_writelane_b32 v0, s4, 14 -; GCN-O0-NEXT: v_writelane_b32 v0, s5, 15 -; GCN-O0-NEXT: s_mov_b64 s[4:5], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s4, 16 -; GCN-O0-NEXT: v_writelane_b32 v0, s5, 17 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: v_readlane_b32 s4, v0, 6 +; GCN-O0-NEXT: v_readlane_b32 s5, v0, 7 +; GCN-O0-NEXT: s_mov_b64 s[6:7], -1 +; GCN-O0-NEXT: v_writelane_b32 v0, s6, 12 +; GCN-O0-NEXT: v_writelane_b32 v0, s7, 13 +; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-O0-NEXT: s_mov_b64 s[6:7], exec +; GCN-O0-NEXT: v_writelane_b32 v0, s6, 14 +; GCN-O0-NEXT: v_writelane_b32 v0, s7, 15 +; GCN-O0-NEXT: s_mov_b64 s[16:17], exec +; GCN-O0-NEXT: s_mov_b64 exec, -1 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: s_cbranch_execz .LBB5_9 -; GCN-O0-NEXT: ; %bb.8: ; %Flow1 +; GCN-O0-NEXT: s_mov_b64 exec, s[16:17] +; GCN-O0-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-O0-NEXT: s_cbranch_scc1 .LBB5_8 +; GCN-O0-NEXT: s_branch .LBB5_9 +; GCN-O0-NEXT: .LBB5_8: ; %Flow1 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_mov_b64 s[4:5], 0 -; GCN-O0-NEXT: s_xor_b64 s[4:5], exec, -1 +; GCN-O0-NEXT: s_mov_b64 exec, s[16:17] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_writelane_b32 v0, s4, 14 -; GCN-O0-NEXT: v_writelane_b32 v0, s5, 15 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; 
GCN-O0-NEXT: v_readlane_b32 s4, v0, 14 +; GCN-O0-NEXT: v_readlane_b32 s5, v0, 15 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0 +; GCN-O0-NEXT: s_xor_b64 s[6:7], exec, -1 +; GCN-O0-NEXT: v_writelane_b32 v0, s6, 12 +; GCN-O0-NEXT: v_writelane_b32 v0, s7, 13 +; GCN-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_mov_b64 exec, s[16:17] +; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-O0-NEXT: .LBB5_9: ; %Flow3 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s8, v4, 16 -; GCN-O0-NEXT: v_readlane_b32 s9, v4, 17 -; GCN-O0-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-O0-NEXT: v_readlane_b32 s6, v4, 4 -; GCN-O0-NEXT: v_readlane_b32 s7, v4, 5 -; GCN-O0-NEXT: v_readlane_b32 s4, v4, 14 -; GCN-O0-NEXT: v_readlane_b32 s5, v4, 15 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[16:17] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s6, v4, 4 +; GCN-O0-NEXT: v_readlane_b32 s7, v4, 5 +; GCN-O0-NEXT: v_readlane_b32 s4, v4, 12 +; GCN-O0-NEXT: v_readlane_b32 s5, v4, 13 ; GCN-O0-NEXT: s_and_b64 s[4:5], exec, s[4:5] -; GCN-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GCN-O0-NEXT: s_mov_b64 s[6:7], 0 -; GCN-O0-NEXT: s_mov_b64 s[8:9], s[4:5] +; 
GCN-O0-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7] +; GCN-O0-NEXT: s_mov_b64 s[4:5], 0 +; GCN-O0-NEXT: s_mov_b64 s[8:9], s[6:7] ; GCN-O0-NEXT: v_writelane_b32 v4, s8, 0 ; GCN-O0-NEXT: v_writelane_b32 v4, s9, 1 -; GCN-O0-NEXT: v_writelane_b32 v4, s6, 2 -; GCN-O0-NEXT: v_writelane_b32 v4, s7, 3 -; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-O0-NEXT: v_writelane_b32 v4, s6, 18 -; GCN-O0-NEXT: v_writelane_b32 v4, s7, 19 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: v_writelane_b32 v4, s4, 2 +; GCN-O0-NEXT: v_writelane_b32 v4, s5, 3 +; GCN-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_waitcnt vmcnt(1) +; GCN-O0-NEXT: s_mov_b64 exec, s[16:17] ; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN-O0-NEXT: s_cbranch_execnz .LBB5_1 +; GCN-O0-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GCN-O0-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GCN-O0-NEXT: s_cbranch_scc1 .LBB5_1 ; GCN-O0-NEXT: ; %bb.10: ; %bb12 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(3) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v0, 18 -; GCN-O0-NEXT: v_readlane_b32 s5, v0, 19 -; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-O0-NEXT: ; %bb.11: ; %bb12 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload 
-; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_mov_b64 exec, s[16:17] ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir b/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir index 48ca53732ed061..c8654c669967be 100644 --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir @@ -12,24 +12,33 @@ body: | ; GCN: bb.0: ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], undef %1:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec + ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 undef %1:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.4 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY1]], undef %3:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec + ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 undef %3:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_1]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc 
+ ; GCN-NEXT: S_BRANCH %bb.3 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.3(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY1]], implicit-def $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.4(0x80000000) ; GCN-NEXT: {{ $}} + ; GCN-NEXT: DBG_VALUE + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc + ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.4: - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc ; GCN-NEXT: DBG_VALUE ; GCN-NEXT: S_ENDPGM 0 bb.0: @@ -43,14 +52,14 @@ body: | %2:sreg_64 = SI_IF undef %3:sreg_64, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2: + SI_WAVE_RECONVERGE %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: - SI_END_CF %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec DBG_VALUE + SI_WAVE_RECONVERGE %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.4: DBG_VALUE - SI_END_CF %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0 ... 
@@ -66,27 +75,35 @@ body: | ; GCN: bb.0: ; GCN-NEXT: successors: %bb.5(0x40000000), %bb.1(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], undef %1:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.5, implicit $exec + ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 undef %1:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.5 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY1]], undef %3:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec + ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 undef %3:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_1]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.3 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.3(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY1]], implicit-def $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.4(0x80000000) ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.4: ; GCN-NEXT: successors: %bb.5(0x80000000) ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc + ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.5: - ; GCN-NEXT: $exec = S_OR_B64 $exec, 
[[COPY]], implicit-def $scc ; GCN-NEXT: S_ENDPGM 0 bb.0: %0:sreg_64 = SI_IF undef %1:sreg_64, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec @@ -97,14 +114,14 @@ body: | %2:sreg_64 = SI_IF undef %3:sreg_64, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2: + SI_WAVE_RECONVERGE %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: - SI_END_CF %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.5: + SI_WAVE_RECONVERGE %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.4: - SI_END_CF %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0 ... @@ -120,29 +137,36 @@ body: | ; GCN: bb.0: ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], undef %1:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.5, implicit $exec + ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 undef %1:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.5 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY1]], undef %3:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec + ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 undef %3:sreg_64, $exec, implicit-def 
$scc + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_1]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.3 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.3(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY1]], implicit-def $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.4(0x80000000) ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.4: ; GCN-NEXT: successors: %bb.5(0x80000000) ; GCN-NEXT: {{ $}} ; GCN-NEXT: DBG_VALUE + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.5: - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc ; GCN-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1, %bb.4 @@ -155,15 +179,15 @@ body: | %2:sreg_64 = SI_IF undef %3:sreg_64, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2: + SI_WAVE_RECONVERGE %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: - SI_END_CF %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.5: DBG_VALUE + SI_WAVE_RECONVERGE %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.4: - SI_END_CF %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0 ... 
@@ -179,31 +203,35 @@ body: | ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000) ; GCN-NEXT: liveins: $vgpr0, $sgpr0_sgpr1 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], undef %1:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec + ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 undef %1:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.4 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY1]], undef %3:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec + ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 undef %3:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_1]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.3 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: ; GCN-NEXT: successors: %bb.3(0x80000000) ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY1]], implicit-def $scc + ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.4(0x80000000) ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; GCN-NEXT: [[S_BREV_B32_:%[0-9]+]]:sgpr_32 = S_BREV_B32 [[DEF]] ; GCN-NEXT: KILL [[DEF]] + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.4: - ; GCN-NEXT: $exec = 
S_OR_B64 $exec, [[COPY]], implicit-def $scc ; GCN-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1, %bb.4 @@ -217,15 +245,15 @@ body: | %2:sreg_64 = SI_IF undef %3:sreg_64, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2: + SI_WAVE_RECONVERGE %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: %4:sgpr_32 = IMPLICIT_DEF %5:sgpr_32 = S_BREV_B32 %4 KILL %4 - SI_END_CF %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + SI_WAVE_RECONVERGE %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.4: - SI_END_CF %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0 ... @@ -242,22 +270,26 @@ body: | ; GCN: bb.0: ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], undef %1:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec + ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 undef %1:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.4 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY1]], undef %3:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec + ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 undef %3:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: $exec 
= S_CMOV_B64_term [[S_AND_B64_1]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.3 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: ; GCN-NEXT: successors: %bb.3(0x80000000) ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY1]], implicit-def $scc + ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.4(0x80000000) ; GCN-NEXT: {{ $}} @@ -265,9 +297,9 @@ body: | ; GCN-NEXT: [[S_BREV_B32_:%[0-9]+]]:sgpr_32 = S_BREV_B32 [[DEF]] ; GCN-NEXT: KILL [[DEF]] ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY [[S_BREV_B32_]] + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.4: - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc ; GCN-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1, %bb.4 @@ -280,16 +312,16 @@ body: | %2:sreg_64 = SI_IF undef %3:sreg_64, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2: + SI_WAVE_RECONVERGE %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: - SI_END_CF %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec %4:sgpr_32 = IMPLICIT_DEF %5:sgpr_32 = S_BREV_B32 %4 KILL %4 %6:sgpr_32 = COPY %5 + SI_WAVE_RECONVERGE %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.4: - SI_END_CF %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0 ... 
@@ -305,30 +337,33 @@ body: | ; GCN: bb.0: ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], undef %1:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec + ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 undef %1:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.4 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY1]], undef %3:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec + ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 undef %3:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_1]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.3 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: ; GCN-NEXT: successors: %bb.3(0x80000000) ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY1]], implicit-def $scc + ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.4(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY1]], implicit-def $scc ; GCN-NEXT: [[S_BREV_B64_:%[0-9]+]]:sreg_64 = S_BREV_B64 $exec + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.4: - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY]], implicit-def 
$scc ; GCN-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1, %bb.4 @@ -341,13 +376,13 @@ body: | %2:sreg_64 = SI_IF undef %3:sreg_64, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2: + SI_WAVE_RECONVERGE %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: - SI_END_CF %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec %4:sreg_64 = S_BREV_B64 $exec + SI_WAVE_RECONVERGE %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.4: - SI_END_CF %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0 ... @@ -363,31 +398,34 @@ body: | ; GCN: bb.0: ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], undef %1:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec + ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 undef %1:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.4 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[DEF:%[0-9]+]]:vreg_128 = IMPLICIT_DEF - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY1]], undef %4:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec + ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 undef %4:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: $exec = S_CMOV_B64_term 
[[S_AND_B64_1]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.3 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: ; GCN-NEXT: successors: %bb.3(0x80000000) ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY1]], implicit-def $scc + ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.4(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY1]], implicit-def $scc ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub2 + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.4: - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc ; GCN-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1, %bb.4 @@ -401,13 +439,13 @@ body: | %3:sreg_64 = SI_IF undef %4:sreg_64, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2: + SI_WAVE_RECONVERGE %3:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: - SI_END_CF %3:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec %5:vgpr_32 = COPY %2.sub2 + SI_WAVE_RECONVERGE %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.4: - SI_END_CF %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0 ... 
@@ -422,31 +460,38 @@ body: | ; GCN: bb.0: ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], undef %1:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec + ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 undef %1:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.4 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000) + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY1]], undef %3:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.5, implicit $exec + ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 undef %3:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_1]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.3 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.3(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY1]], implicit-def $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.5(0x80000000) ; GCN-NEXT: {{ $}} ; GCN-NEXT: S_BRANCH %bb.5 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.4: - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc ; GCN-NEXT: S_ENDPGM 0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.5: ; GCN-NEXT: successors: %bb.4(0x80000000) 
; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc ; GCN-NEXT: S_BRANCH %bb.4 bb.0: successors: %bb.1, %bb.4 @@ -459,16 +504,16 @@ body: | %2:sreg_64 = SI_IF undef %3:sreg_64, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2: + SI_WAVE_RECONVERGE %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: - SI_END_CF %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.5 bb.4: - SI_END_CF %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0 bb.5: + SI_WAVE_RECONVERGE %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.4 ... @@ -494,7 +539,7 @@ body: | ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.1(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[V_CMP_LT_U32_e64_]], implicit-def $scc + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[V_CMP_LT_U32_e64_]], implicit-def $scc ; GCN-NEXT: S_BRANCH %bb.1 bb.0: successors: %bb.1 @@ -506,12 +551,12 @@ body: | bb.1: successors: %bb.1 - SI_END_CF %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + SI_WAVE_RECONVERGE %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.1 ... -# Both s_or_b64 shall be preserved since the outer SI_END_CF belongs to SI_ELSE. +# Both s_or_b64 shall be preserved since the outer SI_WAVE_RECONVERGE belongs to SI_ELSE. 
--- name: simple_outer_if_else @@ -523,43 +568,48 @@ body: | ; GCN: bb.0: ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], undef %1:sreg_64, implicit-def dead $scc - ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64 = S_XOR_B64 [[S_AND_B64_]], [[COPY]], implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 undef %1:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64 = S_XOR_B64 [[S_AND_B64_]], $exec, implicit-def $scc + ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_AND_B64_]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.2 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.2(0x80000000) ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[S_XOR_B64_]], implicit-def $scc ; GCN-NEXT: S_BRANCH %bb.2 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: ; GCN-NEXT: successors: %bb.3(0x40000000), %bb.6(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[S_OR_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_OR_SAVEEXEC_B64 [[S_XOR_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc - ; GCN-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_B64_1]], implicit-def $scc - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.6, implicit $exec + ; GCN-NEXT: [[S_XOR_B64_1:%[0-9]+]]:sreg_64 = S_XOR_B64 [[S_XOR_B64_]], $exec, implicit-def $scc + ; GCN-NEXT: [[S_AND_B64_2:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_XOR_B64_]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_XOR_B64_]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.3, implicit 
$scc + ; GCN-NEXT: S_BRANCH %bb.6 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_2:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY1]], undef %4:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_2]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec + ; GCN-NEXT: [[S_AND_B64_3:%[0-9]+]]:sreg_64 = S_AND_B64 undef %4:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_3]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.4, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.3 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.4: ; GCN-NEXT: successors: %bb.5(0x80000000) ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc + ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.5: ; GCN-NEXT: successors: %bb.6(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY1]], implicit-def $scc + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[S_XOR_B64_1]], implicit-def $scc ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.6: - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[S_AND_B64_1]], implicit-def $scc ; GCN-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1, %bb.2 @@ -568,6 +618,7 @@ body: | bb.1: successors: %bb.2 + SI_WAVE_RECONVERGE %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.2 bb.2: @@ -581,14 +632,14 @@ body: | bb.4: successors: %bb.5 + SI_WAVE_RECONVERGE %3:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.5: successors: %bb.6 - SI_END_CF %3:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + SI_WAVE_RECONVERGE %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.6: - SI_END_CF %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0 ... 
@@ -608,38 +659,38 @@ body: | ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], undef %1:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec + ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 undef %1:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.3 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: ; GCN-NEXT: successors: %bb.6(0x80000000) ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc ; GCN-NEXT: S_BRANCH %bb.6 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.4(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc + ; GCN-NEXT: $exec = S_OR_B64_term $exec, %2, implicit-def $scc ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.4: ; GCN-NEXT: successors: %bb.5(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $exec = S_OR_B64 $exec, %2, implicit-def $scc - ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.5: ; GCN-NEXT: successors: %bb.6(0x80000000) ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.6: ; GCN-NEXT: successors: %bb.4(0x40000000), %bb.0(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY1]], undef %3:sreg_64, implicit-def dead $scc - ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64 = S_XOR_B64 [[S_AND_B64_1]], [[COPY1]], implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec - ; GCN-NEXT: S_BRANCH %bb.0 + ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 undef 
%3:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64 = S_XOR_B64 [[S_AND_B64_1]], $exec, implicit-def $scc + ; GCN-NEXT: [[S_AND_B64_2:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_AND_B64_1]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_1]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.0, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.4 ; GCN-NEXT: S_ENDPGM 0 bb.0: S_BRANCH %bb.6 @@ -648,13 +699,13 @@ body: | %0:sreg_64 = SI_IF undef %1:sreg_64, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2: + SI_WAVE_RECONVERGE %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.6 bb.3: - SI_END_CF %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + SI_WAVE_RECONVERGE %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.4: - SI_END_CF %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.5: @@ -678,27 +729,34 @@ body: | ; GCN: bb.0: ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], undef %1:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec + ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 undef %1:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.4 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 
[[COPY1]], undef %3:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec + ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 undef %3:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_1]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.3 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.3(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY1]], implicit-def $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.4(0x80000000) ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc + ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.4: ; GCN-NEXT: successors: %bb.5(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc - ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.5: ; GCN-NEXT: S_ENDPGM 0 bb.0: @@ -712,13 +770,13 @@ body: | %2:sreg_64 = SI_IF undef %3:sreg_64, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2: + SI_WAVE_RECONVERGE %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: - SI_END_CF %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + SI_WAVE_RECONVERGE %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.4: - SI_END_CF %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.5: S_ENDPGM 0 @@ -740,20 +798,27 @@ body: | ; GCN: bb.0: ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], undef %1:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.4, implicit 
$exec + ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 undef %1:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.4 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000) + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY1]], undef %3:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.5, implicit $exec + ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 undef %3:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_1]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.3 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.3(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY1]], implicit-def $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.5(0x80000000) ; GCN-NEXT: {{ $}} ; GCN-NEXT: S_BRANCH %bb.5 @@ -764,11 +829,10 @@ body: | ; GCN-NEXT: bb.5: ; GCN-NEXT: successors: %bb.6(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc - ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.6: ; GCN-NEXT: successors: %bb.4(0x80000000) ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc ; GCN-NEXT: S_BRANCH %bb.4 bb.0: successors: %bb.1, %bb.4 @@ -781,9 +845,9 @@ body: | %2:sreg_64 = SI_IF undef %3:sreg_64, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2: + SI_WAVE_RECONVERGE %2:sreg_64, implicit-def dead $exec, 
implicit-def dead $scc, implicit $exec bb.3: - SI_END_CF %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.5 bb.4: @@ -791,9 +855,9 @@ body: | bb.5: - SI_END_CF %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.6: + SI_WAVE_RECONVERGE %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.4 ... @@ -815,54 +879,66 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GCN-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 0, killed [[DEF]], implicit $exec - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.14, implicit $exec - ; GCN-NEXT: S_BRANCH %bb.1 + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[V_CMP_EQ_U32_e64_]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.14 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.14(0x40000000) + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.6(0x40000000) ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GCN-NEXT: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 0, killed [[DEF1]], implicit $exec - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY1]], killed [[V_CMP_EQ_U32_e64_1]], implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.14, implicit $exec - ; GCN-NEXT: S_BRANCH %bb.2 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: 
[[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_1]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[V_CMP_EQ_U32_e64_1]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.6 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: ; GCN-NEXT: successors: %bb.3(0x40000000), %bb.7(0x40000000) ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GCN-NEXT: [[V_CMP_EQ_U32_e64_2:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 0, killed [[DEF2]], implicit $exec - ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_2:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], killed [[V_CMP_EQ_U32_e64_2]], implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_2]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.7, implicit $exec - ; GCN-NEXT: S_BRANCH %bb.3 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[S_AND_B64_2:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_2]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[V_CMP_EQ_U32_e64_2]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.3, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.7 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.3: - ; GCN-NEXT: successors: %bb.4(0x40000000), %bb.7(0x40000000) + ; GCN-NEXT: successors: %bb.4(0x40000000), %bb.5(0x40000000) ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GCN-NEXT: [[V_CMP_EQ_U32_e64_3:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 0, killed [[DEF3]], implicit $exec - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_3:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY3]], killed [[V_CMP_EQ_U32_e64_3]], implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_3]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.7, implicit $exec - ; GCN-NEXT: S_BRANCH %bb.4 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[S_AND_B64_3:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_3]], -1, implicit-def 
$scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[V_CMP_EQ_U32_e64_3]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.4, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.5 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.4: + ; GCN-NEXT: successors: %bb.5(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY3]], implicit-def $scc + ; GCN-NEXT: S_BRANCH %bb.5 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.5: ; GCN-NEXT: successors: %bb.7(0x80000000) ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY2]], implicit-def $scc ; GCN-NEXT: S_BRANCH %bb.7 ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.6: + ; GCN-NEXT: successors: %bb.14(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc + ; GCN-NEXT: S_BRANCH %bb.14 + ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.7: ; GCN-NEXT: successors: %bb.8(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc ; GCN-NEXT: S_BRANCH %bb.8 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.8: @@ -875,34 +951,40 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GCN-NEXT: [[V_CMP_EQ_U32_e64_4:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 0, killed [[DEF4]], implicit $exec - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_4:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY4]], killed [[V_CMP_EQ_U32_e64_4]], implicit-def dead $scc - ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64 = S_XOR_B64 [[S_AND_B64_4]], [[COPY4]], implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_4]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.12, implicit $exec - ; GCN-NEXT: S_BRANCH %bb.11 + ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64 = S_XOR_B64 [[V_CMP_EQ_U32_e64_4]], $exec, implicit-def $scc + ; GCN-NEXT: [[S_AND_B64_4:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_4]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[V_CMP_EQ_U32_e64_4]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.11, implicit $scc + ; 
GCN-NEXT: S_BRANCH %bb.12 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.10: - ; GCN-NEXT: successors: %bb.14(0x80000000) + ; GCN-NEXT: successors: %bb.13(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: S_BRANCH %bb.14 + ; GCN-NEXT: $exec = S_OR_B64_term $exec, %15, implicit-def $scc + ; GCN-NEXT: S_BRANCH %bb.13 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.11: ; GCN-NEXT: successors: %bb.12(0x80000000) ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[S_XOR_B64_]], implicit-def $scc ; GCN-NEXT: S_BRANCH %bb.12 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.12: - ; GCN-NEXT: successors: %bb.10(0x40000000), %bb.14(0x40000000) + ; GCN-NEXT: successors: %bb.10(0x40000000), %bb.13(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[S_XOR_B64_1:%[0-9]+]]:sreg_64 = S_XOR_B64 [[S_XOR_B64_]], $exec, implicit-def $scc + ; GCN-NEXT: [[S_AND_B64_5:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_XOR_B64_]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_XOR_B64_]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.10, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.13 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[S_OR_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_OR_SAVEEXEC_B64 [[S_XOR_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; GCN-NEXT: [[S_AND_B64_5:%[0-9]+]]:sreg_64 = S_AND_B64 $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc - ; GCN-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_B64_5]], implicit-def $scc - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.14, implicit $exec - ; GCN-NEXT: S_BRANCH %bb.10 + ; GCN-NEXT: bb.13: + ; GCN-NEXT: successors: %bb.6(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY1]], implicit-def $scc + ; GCN-NEXT: S_BRANCH %bb.6 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.14: - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc ; GCN-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1, %bb.14 @@ -939,24 +1021,24 @@ body: | bb.4: successors: %bb.5 + SI_WAVE_RECONVERGE %11:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.5 bb.5: successors: 
%bb.7 - SI_END_CF %11:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + SI_WAVE_RECONVERGE %8:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.7 bb.6: successors: %bb.14 - SI_END_CF %5:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + SI_WAVE_RECONVERGE %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.14 bb.7: successors: %bb.8 - SI_END_CF %8:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.8 bb.8: @@ -975,11 +1057,13 @@ body: | bb.10: successors: %bb.13 + SI_WAVE_RECONVERGE %15:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.13 bb.11: successors: %bb.12 + SI_WAVE_RECONVERGE %14:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.12 bb.12: @@ -991,12 +1075,11 @@ body: | bb.13: successors: %bb.6 - SI_END_CF %15:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + SI_WAVE_RECONVERGE %5:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.6 bb.14: - SI_END_CF %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0 ... diff --git a/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir b/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir index 3db2b6ed9ab4ba..07eed859ad16e3 100644 --- a/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir +++ b/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir @@ -414,10 +414,10 @@ body: | %6:sreg_64 = S_MOV_B64 0 %7:sreg_64 = S_AND_B64 $exec, killed %6, implicit-def dead $scc $vcc = COPY %7 + SI_WAVE_RECONVERGE %0, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: liveins: $vcc - SI_END_CF %0, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0, implicit $vcc ... 
diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll index 09dc6d6bff9e31..ccb0033a2eb2f3 100644 --- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll @@ -148,9 +148,9 @@ define { i32, half } @call_split_type_used_outside_block_struct() #0 { ; GCN-NEXT: s_add_u32 s16, s16, func_struct@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s17, s17, func_struct@rel32@hi+12 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: v_mov_b32_e32 v1, v4 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: v_mov_b32_e32 v1, v4 ; GCN-NEXT: v_readlane_b32 s4, v40, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/cse-convergent.ll b/llvm/test/CodeGen/AMDGPU/cse-convergent.ll index 0d74bd39b56fec..a606bcb04b9d0d 100644 --- a/llvm/test/CodeGen/AMDGPU/cse-convergent.ll +++ b/llvm/test/CodeGen/AMDGPU/cse-convergent.ll @@ -10,20 +10,21 @@ define i32 @test(i32 %val, i32 %cond) { ; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt_depctr 0xffe3 ; GCN-NEXT: s_mov_b32 exec_lo, s4 -; GCN-NEXT: s_or_saveexec_b32 s4, -1 +; GCN-NEXT: s_mov_b32 s4, exec_lo +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: s_not_b32 exec_lo, exec_lo ; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: s_mov_b32 exec_lo, s4 -; GCN-NEXT: v_mov_b32_e32 v3, v0 ; GCN-NEXT: s_not_b32 exec_lo, exec_lo +; GCN-NEXT: s_or_saveexec_b32 s5, -1 ; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: s_not_b32 exec_lo, exec_lo -; GCN-NEXT: s_or_saveexec_b32 s4, -1 -; GCN-NEXT: v_mov_b32_dpp v2, v3 row_xmask:1 row_mask:0xf bank_mask:0xf -; GCN-NEXT: s_mov_b32 exec_lo, s4 -; GCN-NEXT: v_mov_b32_e32 v5, 0 -; GCN-NEXT: v_mov_b32_e32 v4, v2 +; GCN-NEXT: v_mov_b32_dpp v3, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; 
GCN-NEXT: s_mov_b32 exec_lo, s5 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GCN-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: v_mov_b32_e32 v1, v3 +; GCN-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GCN-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GCN-NEXT: s_cbranch_scc0 .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %if ; GCN-NEXT: s_or_saveexec_b32 s5, -1 ; GCN-NEXT: v_mov_b32_e32 v2, 0 @@ -35,10 +36,10 @@ define i32 @test(i32 %val, i32 %cond) { ; GCN-NEXT: s_or_saveexec_b32 s5, -1 ; GCN-NEXT: v_mov_b32_dpp v2, v3 row_xmask:1 row_mask:0xf bank_mask:0xf ; GCN-NEXT: s_mov_b32 exec_lo, s5 -; GCN-NEXT: v_mov_b32_e32 v5, v2 -; GCN-NEXT: ; %bb.2: ; %end +; GCN-NEXT: v_mov_b32_e32 v4, v2 ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GCN-NEXT: v_add_nc_u32_e32 v0, v4, v5 +; GCN-NEXT: .LBB0_2: ; %end +; GCN-NEXT: v_add_nc_u32_e32 v0, v1, v4 ; GCN-NEXT: s_xor_saveexec_b32 s4, -1 ; GCN-NEXT: s_clause 0x1 ; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll index de14d64dbf7e9d..0e9a3d706cc335 100644 --- a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll @@ -134,10 +134,10 @@ define protected amdgpu_kernel void @nand(ptr addrspace(1) %p, ptr addrspace(1) ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; CHECK-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; CHECK-NEXT: s_andn2_b64 exec, exec, s[0:1] -; CHECK-NEXT: s_cbranch_execnz .LBB5_1 +; CHECK-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; CHECK-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; CHECK-NEXT: s_cbranch_scc1 .LBB5_1 ; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end -; CHECK-NEXT: s_or_b64 exec, exec, s[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, s6 ; CHECK-NEXT: v_mov_b32_e32 v3, s7 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3] @@ -435,10 +435,10 @@ define protected amdgpu_kernel void @fadd(ptr addrspace(1) %p, ptr addrspace(1) ; 
CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; CHECK-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; CHECK-NEXT: s_andn2_b64 exec, exec, s[0:1] -; CHECK-NEXT: s_cbranch_execnz .LBB18_1 +; CHECK-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; CHECK-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; CHECK-NEXT: s_cbranch_scc1 .LBB18_1 ; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end -; CHECK-NEXT: s_or_b64 exec, exec, s[0:1] ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v0 ; CHECK-NEXT: v_mov_b32_e32 v0, s6 ; CHECK-NEXT: v_mov_b32_e32 v1, s7 @@ -472,10 +472,10 @@ define protected amdgpu_kernel void @fsub(ptr addrspace(1) %p, ptr addrspace(1) ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; CHECK-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; CHECK-NEXT: s_andn2_b64 exec, exec, s[0:1] -; CHECK-NEXT: s_cbranch_execnz .LBB19_1 +; CHECK-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; CHECK-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; CHECK-NEXT: s_cbranch_scc1 .LBB19_1 ; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end -; CHECK-NEXT: s_or_b64 exec, exec, s[0:1] ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v0 ; CHECK-NEXT: v_mov_b32_e32 v0, s6 ; CHECK-NEXT: v_mov_b32_e32 v1, s7 diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll index 5cadb65c9c942f..1092386eb90c28 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll @@ -18,11 +18,11 @@ define i32 @divergent_lshr_and_cmp(i32 %x) { ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2 ; GCN-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 killed [[S_MOV_B32_]], [[COPY]], implicit $exec + ; GCN-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: S_BRANCH %bb.2 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2.UnifiedReturnBlock: ; GCN-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.0, [[V_LSHLREV_B32_e64_]], %bb.1 - ; 
GCN-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: $vgpr0 = COPY [[PHI]] ; GCN-NEXT: SI_RETURN implicit $vgpr0 entry: diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-v1i8-extractvecelt-crash.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-v1i8-extractvecelt-crash.ll index eecc91239c7283..3e200abe213eb2 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-v1i8-extractvecelt-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-v1i8-extractvecelt-crash.ll @@ -8,13 +8,15 @@ define void @wombat(i1 %cond, ptr addrspace(5) %addr) { ; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CHECK-NEXT: s_cbranch_execz .LBB0_2 +; CHECK-NEXT: s_mov_b64 s[4:5], exec +; CHECK-NEXT: s_cmp_lg_u64 vcc, 0 +; CHECK-NEXT: s_cmov_b64 exec, vcc +; CHECK-NEXT: s_cbranch_scc0 .LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %then ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: .LBB0_2: ; %end ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: .LBB0_2: ; %end ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_store_byte v2, v1, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index fea1303d0a2b76..90f5f89763712e 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -69,12 +69,13 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_subbrev_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v18, v16 +; GFX9-NEXT: s_mov_b64 s[8:9], exec ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v19, v17 +; GFX9-NEXT: v_mov_b32_e32 v18, v16 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX9-NEXT: 
v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v19, v17 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc ; GFX9-NEXT: v_and_b32_e32 v6, 1, v6 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 @@ -84,13 +85,14 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] ; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], vcc ; GFX9-NEXT: v_cndmask_b32_e64 v13, v11, 0, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v12, v10, 0, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, 0, s[4:5] +; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec ; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, 0, s[4:5] -; GFX9-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB0_6 +; GFX9-NEXT: s_cmov_b64 exec, s[6:7] +; GFX9-NEXT: s_cbranch_scc0 .LBB0_6 ; GFX9-NEXT: ; %bb.1: ; %udiv-bb1 ; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 1, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v3, vcc @@ -109,20 +111,21 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_or_b32_e32 v5, v5, v12 ; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[4:5], v7, v[8:9] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_mov_b32_e32 v12, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-NEXT: s_xor_b64 s[6:7], vcc, exec ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, v5, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: v_mov_b32_e32 v13, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5] -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[6:7], exec, 
s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB0_5 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB0_5 ; GFX9-NEXT: ; %bb.2: ; %udiv-preheader ; GFX9-NEXT: v_sub_u32_e32 v12, 64, v22 ; GFX9-NEXT: v_lshrrev_b64 v[6:7], v22, v[8:9] @@ -179,22 +182,22 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, -1, v24, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v25, vcc ; GFX9-NEXT: v_or_b32_e32 v5, v15, v5 -; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX9-NEXT: v_or_b32_e32 v14, v22, v24 ; GFX9-NEXT: v_or_b32_e32 v15, v23, v25 +; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] ; GFX9-NEXT: v_or3_b32 v2, v2, v6, v12 ; GFX9-NEXT: v_and_b32_e32 v6, 1, v30 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v15, v7 ; GFX9-NEXT: v_or3_b32 v3, v3, 0, v13 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[10:11], exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v14, v6 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB0_3 +; GFX9-NEXT: s_cselect_b64 exec, s[10:11], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB0_3 ; GFX9-NEXT: ; %bb.4: ; %Flow -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: .LBB0_5: ; %Flow2 ; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: .LBB0_5: ; %Flow2 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v5 @@ -202,8 +205,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_or3_b32 v12, v2, v4, v12 ; GFX9-NEXT: v_or_b32_e32 v7, v7, v1 ; GFX9-NEXT: v_or_b32_e32 v6, v6, v0 -; GFX9-NEXT: .LBB0_6: ; %Flow3 ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-NEXT: .LBB0_6: ; %udiv-end ; GFX9-NEXT: v_xor_b32_e32 v2, v17, v16 ; GFX9-NEXT: v_xor_b32_e32 v3, v19, v18 ; GFX9-NEXT: v_xor_b32_e32 v0, v6, v2 @@ -221,8 +224,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) 
{ ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane @@ -536,32 +539,25 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 -; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7] +; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec -; GFX9-O0-NEXT: v_writelane_b32 v0, s4, 4 -; GFX9-O0-NEXT: v_writelane_b32 v0, s5, 5 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 4 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 5 +; GFX9-O0-NEXT: s_mov_b64 s[22:23], exec +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill 
; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: s_cbranch_execz .LBB0_3 -; GFX9-O0-NEXT: s_branch .LBB0_8 +; GFX9-O0-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_cbranch_scc1 .LBB0_7 +; GFX9-O0-NEXT: s_branch .LBB0_2 ; GFX9-O0-NEXT: .LBB0_1: ; %Flow -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v0, 6 -; GFX9-O0-NEXT: v_readlane_b32 s5, v0, 7 -; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: ; %bb.2: ; %Flow ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload @@ -570,7 +566,12 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(6) +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 6 +; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 7 ; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill @@ -583,15 +584,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:88 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_branch .LBB0_5 -; GFX9-O0-NEXT: .LBB0_3: ; %Flow2 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v4, 4 -; GFX9-O0-NEXT: v_readlane_b32 s5, v4, 5 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: s_branch .LBB0_4 +; GFX9-O0-NEXT: .LBB0_2: ; %Flow2 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -603,8 +598,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_branch .LBB0_9 -; GFX9-O0-NEXT: .LBB0_4: ; %udiv-loop-exit +; GFX9-O0-NEXT: s_branch .LBB0_8 +; GFX9-O0-NEXT: .LBB0_3: ; %udiv-loop-exit ; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload @@ -613,13 +608,17 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b32 s4, 1 -; GFX9-O0-NEXT: s_waitcnt 
vmcnt(2) -; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[0:1] +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_lshlrev_b64 v[9:10], s4, v[9:10] -; GFX9-O0-NEXT: s_mov_b32 s4, 63 -; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 4 +; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 5 +; GFX9-O0-NEXT: s_mov_b32 s6, 1 +; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s6, v[0:1] +; GFX9-O0-NEXT: v_lshlrev_b64 v[9:10], s6, v[9:10] +; GFX9-O0-NEXT: s_mov_b32 s6, 63 +; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s6, v[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v8 @@ -643,15 +642,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_branch .LBB0_3 -; GFX9-O0-NEXT: .LBB0_5: ; %Flow1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 8 -; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 9 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: s_branch .LBB0_2 +; GFX9-O0-NEXT: .LBB0_4: ; %Flow1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload @@ -673,15 +666,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; 
GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_branch .LBB0_4 -; GFX9-O0-NEXT: .LBB0_6: ; %udiv-do-while +; GFX9-O0-NEXT: s_branch .LBB0_3 +; GFX9-O0-NEXT: .LBB0_5: ; %udiv-do-while ; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s6, v16, 10 -; GFX9-O0-NEXT: v_readlane_b32 s7, v16, 11 ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload @@ -698,6 +685,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload @@ -706,8 +696,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 ; 4-byte 
Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(8) +; GFX9-O0-NEXT: v_readlane_b32 s6, v16, 8 +; GFX9-O0-NEXT: v_readlane_b32 s7, v16, 9 ; GFX9-O0-NEXT: s_mov_b32 s4, 63 -; GFX9-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-O0-NEXT: v_lshrrev_b64 v[29:30], s4, v[2:3] ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v30 ; GFX9-O0-NEXT: s_mov_b32 s5, 1 @@ -734,7 +726,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v30 -; GFX9-O0-NEXT: s_waitcnt vmcnt(10) ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v28 ; GFX9-O0-NEXT: v_or3_b32 v6, v6, v7, v10 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 @@ -744,7 +735,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-O0-NEXT: s_waitcnt vmcnt(8) ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v26 ; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 @@ -842,7 +832,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v18, v19 ; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[17:18], v[12:13] -; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX9-O0-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v18, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v2 ; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill @@ -863,12 +853,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 6 -; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 7 -; 
GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 10 -; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 11 +; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX9-O0-NEXT: v_writelane_b32 v16, s4, 8 +; GFX9-O0-NEXT: v_writelane_b32 v16, s5, 9 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 ; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] @@ -896,10 +883,11 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: s_cbranch_execnz .LBB0_6 +; GFX9-O0-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX9-O0-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX9-O0-NEXT: s_cbranch_scc1 .LBB0_5 ; GFX9-O0-NEXT: s_branch .LBB0_1 -; GFX9-O0-NEXT: .LBB0_7: ; %udiv-preheader +; GFX9-O0-NEXT: .LBB0_6: ; %udiv-preheader ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload @@ -1002,8 +990,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v14, s8 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, s7 ; GFX9-O0-NEXT: v_mov_b32_e32 v12, s6 -; GFX9-O0-NEXT: v_writelane_b32 v16, s4, 10 -; GFX9-O0-NEXT: v_writelane_b32 v16, s5, 11 +; GFX9-O0-NEXT: v_writelane_b32 v16, s4, 8 +; GFX9-O0-NEXT: v_writelane_b32 v16, s5, 9 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 ; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] @@ -1031,8 +1019,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:200 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_branch .LBB0_6 -; GFX9-O0-NEXT: .LBB0_8: ; %udiv-bb1 +; GFX9-O0-NEXT: s_branch .LBB0_5 +; GFX9-O0-NEXT: .LBB0_7: ; %udiv-bb1 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] @@ -1157,18 +1145,17 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec -; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] -; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] -; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 8 -; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 9 +; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 6 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 7 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: s_cbranch_execz .LBB0_5 -; GFX9-O0-NEXT: s_branch .LBB0_7 -; GFX9-O0-NEXT: .LBB0_9: ; %udiv-end +; GFX9-O0-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-O0-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_cbranch_scc1 .LBB0_6 +; GFX9-O0-NEXT: s_branch .LBB0_4 +; GFX9-O0-NEXT: .LBB0_8: ; %udiv-end ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] @@ -1224,8 +1211,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded 
Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) @@ -1237,9 +1226,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_ashrrev_i32_e32 v16, 31, v3 ; GFX9-G-NEXT: v_xor_b32_e32 v0, v16, v0 ; GFX9-G-NEXT: v_xor_b32_e32 v1, v16, v1 -; GFX9-G-NEXT: v_sub_co_u32_e32 v10, vcc, v0, v16 +; GFX9-G-NEXT: v_sub_co_u32_e32 v8, vcc, v0, v16 ; GFX9-G-NEXT: v_xor_b32_e32 v2, v16, v2 -; GFX9-G-NEXT: v_subb_co_u32_e32 v11, vcc, v1, v16, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v9, vcc, v1, v16, vcc ; GFX9-G-NEXT: v_ashrrev_i32_e32 v17, 31, v7 ; GFX9-G-NEXT: v_xor_b32_e32 v3, v16, v3 ; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v2, v16, vcc @@ -1255,8 +1244,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_or_b32_e32 v0, v18, v4 ; GFX9-G-NEXT: v_or_b32_e32 v1, v19, v5 ; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GFX9-G-NEXT: v_or_b32_e32 v0, v10, v12 -; GFX9-G-NEXT: v_or_b32_e32 v1, v11, v13 +; GFX9-G-NEXT: v_or_b32_e32 v0, v8, v12 +; GFX9-G-NEXT: v_or_b32_e32 v1, v9, v13 ; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GFX9-G-NEXT: v_ffbh_u32_e32 v1, v18 ; GFX9-G-NEXT: v_ffbh_u32_e32 v0, v19 @@ -1268,9 +1257,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[4:5] ; GFX9-G-NEXT: v_add_u32_e32 v0, 64, v0 ; GFX9-G-NEXT: v_min_u32_e32 v1, v1, v2 -; GFX9-G-NEXT: v_ffbh_u32_e32 v2, v10 +; GFX9-G-NEXT: v_ffbh_u32_e32 v2, v8 ; GFX9-G-NEXT: 
v_cndmask_b32_e64 v0, v1, v0, s[6:7] -; GFX9-G-NEXT: v_ffbh_u32_e32 v1, v11 +; GFX9-G-NEXT: v_ffbh_u32_e32 v1, v9 ; GFX9-G-NEXT: v_add_u32_e32 v2, 32, v2 ; GFX9-G-NEXT: v_ffbh_u32_e32 v3, v12 ; GFX9-G-NEXT: v_min_u32_e32 v1, v1, v2 @@ -1293,60 +1282,64 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_or_b32_e32 v15, v1, v3 ; GFX9-G-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[6:7] ; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[2:3] -; GFX9-G-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-G-NEXT: s_mov_b64 s[12:13], exec ; GFX9-G-NEXT: v_cndmask_b32_e64 v6, v7, v6, s[6:7] ; GFX9-G-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GFX9-G-NEXT: v_or_b32_e32 v20, v7, v6 +; GFX9-G-NEXT: v_or_b32_e32 v11, v7, v6 ; GFX9-G-NEXT: v_xor_b32_e32 v6, 0x7f, v0 ; GFX9-G-NEXT: v_or_b32_e32 v14, v6, v2 -; GFX9-G-NEXT: v_and_b32_e32 v6, 1, v20 +; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[14:15] +; GFX9-G-NEXT: v_and_b32_e32 v6, 1, v11 +; GFX9-G-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; GFX9-G-NEXT: v_or_b32_e32 v11, v11, v14 +; GFX9-G-NEXT: v_and_b32_e32 v11, 1, v11 +; GFX9-G-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v11 ; GFX9-G-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GFX9-G-NEXT: v_cndmask_b32_e64 v6, v10, 0, vcc -; GFX9-G-NEXT: v_cndmask_b32_e64 v7, v11, 0, vcc -; GFX9-G-NEXT: v_cndmask_b32_e64 v8, v12, 0, vcc -; GFX9-G-NEXT: v_cndmask_b32_e64 v9, v13, 0, vcc -; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] -; GFX9-G-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GFX9-G-NEXT: v_or_b32_e32 v14, v20, v14 -; GFX9-G-NEXT: v_and_b32_e32 v14, 1, v14 -; GFX9-G-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GFX9-G-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GFX9-G-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GFX9-G-NEXT: s_cbranch_execz .LBB0_6 +; GFX9-G-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GFX9-G-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-G-NEXT: v_cndmask_b32_e64 v6, v8, 0, vcc +; GFX9-G-NEXT: v_cndmask_b32_e64 v7, v9, 0, vcc +; GFX9-G-NEXT: v_cndmask_b32_e64 v10, v12, 0, vcc +; GFX9-G-NEXT: s_and_b64 s[4:5], s[4:5], exec +; 
GFX9-G-NEXT: v_cndmask_b32_e64 v11, v13, 0, vcc +; GFX9-G-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-G-NEXT: s_cbranch_scc0 .LBB0_6 ; GFX9-G-NEXT: ; %bb.1: ; %udiv-bb1 ; GFX9-G-NEXT: v_add_co_u32_e32 v20, vcc, 1, v0 ; GFX9-G-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v1, vcc ; GFX9-G-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v2, vcc ; GFX9-G-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v3, vcc ; GFX9-G-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GFX9-G-NEXT: v_sub_co_u32_e32 v8, vcc, 0x7f, v0 -; GFX9-G-NEXT: v_sub_u32_e32 v0, 64, v8 -; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v0, v[10:11] -; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], v8, v[12:13] -; GFX9-G-NEXT: v_subrev_u32_e32 v9, 64, v8 -; GFX9-G-NEXT: v_lshlrev_b64 v[6:7], v8, v[10:11] +; GFX9-G-NEXT: v_sub_co_u32_e32 v10, vcc, 0x7f, v0 +; GFX9-G-NEXT: v_sub_u32_e32 v0, 64, v10 +; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v0, v[8:9] +; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], v10, v[12:13] +; GFX9-G-NEXT: v_subrev_u32_e32 v11, 64, v10 +; GFX9-G-NEXT: v_lshlrev_b64 v[6:7], v10, v[8:9] ; GFX9-G-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX9-G-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], v9, v[10:11] -; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v8 +; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], v11, v[8:9] +; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 ; GFX9-G-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-G-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 -; GFX9-G-NEXT: v_cndmask_b32_e32 v8, v0, v12, vcc -; GFX9-G-NEXT: v_cndmask_b32_e32 v9, v1, v13, vcc +; GFX9-G-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 +; GFX9-G-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-G-NEXT: v_cndmask_b32_e32 v10, v0, v12, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v11, v1, v13, vcc ; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX9-G-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-G-NEXT: s_xor_b64 s[6:7], s[4:5], exec ; GFX9-G-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-G-NEXT: s_cmp_lg_u64 s[4:5], 0 
; GFX9-G-NEXT: v_mov_b32_e32 v2, s10 ; GFX9-G-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-G-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] -; GFX9-G-NEXT: s_xor_b64 s[12:13], exec, s[8:9] -; GFX9-G-NEXT: s_cbranch_execz .LBB0_5 +; GFX9-G-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-G-NEXT: s_cbranch_scc0 .LBB0_5 ; GFX9-G-NEXT: ; %bb.2: ; %udiv-preheader ; GFX9-G-NEXT: v_sub_u32_e32 v2, 64, v20 -; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v20, v[10:11] +; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v20, v[8:9] ; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], v2, v[12:13] ; GFX9-G-NEXT: v_subrev_u32_e32 v24, 64, v20 ; GFX9-G-NEXT: v_lshrrev_b64 v[14:15], v20, v[12:13] @@ -1359,27 +1352,26 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_cndmask_b32_e32 v14, 0, v14, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v15, 0, v15, vcc ; GFX9-G-NEXT: v_add_co_u32_e32 v24, vcc, -1, v18 -; GFX9-G-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-G-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v20 ; GFX9-G-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v19, vcc -; GFX9-G-NEXT: v_cndmask_b32_e64 v12, v0, v10, s[4:5] -; GFX9-G-NEXT: v_cndmask_b32_e64 v13, v1, v11, s[4:5] +; GFX9-G-NEXT: v_cndmask_b32_e64 v12, v0, v8, s[4:5] +; GFX9-G-NEXT: v_cndmask_b32_e64 v13, v1, v9, s[4:5] ; GFX9-G-NEXT: v_addc_co_u32_e32 v26, vcc, -1, v4, vcc ; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX9-G-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-G-NEXT: v_addc_co_u32_e32 v27, vcc, -1, v5, vcc -; GFX9-G-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-G-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-G-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-G-NEXT: v_mov_b32_e32 v2, s10 ; GFX9-G-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-G-NEXT: .LBB0_3: ; %udiv-do-while ; GFX9-G-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[6:7] -; GFX9-G-NEXT: v_lshrrev_b32_e32 v10, 31, v7 +; GFX9-G-NEXT: v_lshrrev_b32_e32 v8, 31, v7 ; GFX9-G-NEXT: v_or_b32_e32 v6, v0, v2 ; GFX9-G-NEXT: v_or_b32_e32 v7, v1, v3 ; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[12:13] -; GFX9-G-NEXT: v_lshrrev_b32_e32 v12, 31, v9 +; GFX9-G-NEXT: 
v_lshrrev_b32_e32 v12, 31, v11 ; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], 1, v[14:15] ; GFX9-G-NEXT: v_or_b32_e32 v2, v2, v12 ; GFX9-G-NEXT: v_lshrrev_b32_e32 v14, 31, v13 @@ -1401,36 +1393,36 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_addc_co_u32_e32 v21, vcc, -1, v21, vcc ; GFX9-G-NEXT: v_addc_co_u32_e32 v22, vcc, -1, v22, vcc ; GFX9-G-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v23, vcc -; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] ; GFX9-G-NEXT: v_or_b32_e32 v0, v20, v22 ; GFX9-G-NEXT: v_or_b32_e32 v1, v21, v23 +; GFX9-G-NEXT: v_lshlrev_b64 v[10:11], 1, v[10:11] ; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v10 -; GFX9-G-NEXT: v_and_b32_e32 v10, 1, v28 -; GFX9-G-NEXT: v_mov_b32_e32 v0, v10 +; GFX9-G-NEXT: v_or_b32_e32 v10, v10, v8 +; GFX9-G-NEXT: v_and_b32_e32 v8, 1, v28 ; GFX9-G-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX9-G-NEXT: v_mov_b32_e32 v1, v11 -; GFX9-G-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX9-G-NEXT: s_cbranch_execnz .LBB0_3 +; GFX9-G-NEXT: v_mov_b32_e32 v0, v8 +; GFX9-G-NEXT: s_andn2_b64 s[4:5], exec, s[8:9] +; GFX9-G-NEXT: v_mov_b32_e32 v1, v9 +; GFX9-G-NEXT: s_cselect_b64 exec, s[4:5], s[8:9] +; GFX9-G-NEXT: s_cbranch_scc1 .LBB0_3 ; GFX9-G-NEXT: ; %bb.4: ; %Flow -; GFX9-G-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-G-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-G-NEXT: .LBB0_5: ; %Flow2 -; GFX9-G-NEXT: s_or_b64 exec, exec, s[12:13] ; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[6:7] -; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] +; GFX9-G-NEXT: v_lshlrev_b64 v[10:11], 1, v[10:11] ; GFX9-G-NEXT: v_lshrrev_b32_e32 v4, 31, v7 -; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v4 +; GFX9-G-NEXT: v_or_b32_e32 v10, v10, v4 ; GFX9-G-NEXT: v_or_b32_e32 v6, v0, v2 ; GFX9-G-NEXT: v_or_b32_e32 v7, v1, v3 -; GFX9-G-NEXT: .LBB0_6: ; %Flow3 -; GFX9-G-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-G-NEXT: s_or_b64 exec, exec, s[12:13] +; GFX9-G-NEXT: .LBB0_6: ; %udiv-end ; GFX9-G-NEXT: v_xor_b32_e32 v3, v17, v16 ; GFX9-G-NEXT: 
v_xor_b32_e32 v0, v6, v3 ; GFX9-G-NEXT: v_xor_b32_e32 v1, v7, v3 ; GFX9-G-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v3 -; GFX9-G-NEXT: v_xor_b32_e32 v2, v8, v3 +; GFX9-G-NEXT: v_xor_b32_e32 v2, v10, v3 ; GFX9-G-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-G-NEXT: v_xor_b32_e32 v4, v9, v3 +; GFX9-G-NEXT: v_xor_b32_e32 v4, v11, v3 ; GFX9-G-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v3, vcc ; GFX9-G-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc ; GFX9-G-NEXT: s_setpc_b64 s[30:31] @@ -1440,10 +1432,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-G-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-G-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane ; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v0 @@ -1680,31 +1671,24 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_and_b32_e32 v5, 1, v5 ; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v5 ; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], -1 -; GFX9-G-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] +; GFX9-G-O0-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-G-O0-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], exec -; GFX9-G-O0-NEXT: v_writelane_b32 v0, s4, 0 -; GFX9-G-O0-NEXT: v_writelane_b32 v0, s5, 1 -; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], exec +; GFX9-G-O0-NEXT: v_writelane_b32 v0, s6, 0 +; GFX9-G-O0-NEXT: v_writelane_b32 v0, s7, 1 +; GFX9-G-O0-NEXT: s_mov_b64 s[20:21], exec +; GFX9-G-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-G-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-G-O0-NEXT: s_cbranch_execz .LBB0_3 -; GFX9-G-O0-NEXT: s_branch .LBB0_8 +; GFX9-G-O0-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-G-O0-NEXT: s_cbranch_scc1 .LBB0_7 +; GFX9-G-O0-NEXT: s_branch .LBB0_2 ; GFX9-G-O0-NEXT: .LBB0_1: ; %Flow -; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-G-O0-NEXT: v_readlane_b32 s4, v0, 2 -; GFX9-G-O0-NEXT: v_readlane_b32 s5, v0, 3 -; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-G-O0-NEXT: ; %bb.2: ; %Flow ; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload @@ -1713,7 +1697,12 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(4) +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_readlane_b32 s4, v8, 2 +; GFX9-G-O0-NEXT: v_readlane_b32 s5, v8, 3 ; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill @@ -1724,15 +1713,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_branch .LBB0_5 -; GFX9-G-O0-NEXT: .LBB0_3: ; %Flow2 -; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-G-O0-NEXT: v_readlane_b32 s4, v4, 0 -; GFX9-G-O0-NEXT: v_readlane_b32 s5, v4, 1 ; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-G-O0-NEXT: s_branch .LBB0_4 +; GFX9-G-O0-NEXT: .LBB0_2: ; %Flow2 ; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -1743,8 +1726,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; 
GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_branch .LBB0_9 -; GFX9-G-O0-NEXT: .LBB0_4: ; %udiv-loop-exit +; GFX9-G-O0-NEXT: s_branch .LBB0_8 +; GFX9-G-O0-NEXT: .LBB0_3: ; %udiv-loop-exit ; GFX9-G-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload @@ -1753,23 +1736,28 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_readlane_b32 s4, v0, 0 +; GFX9-G-O0-NEXT: v_readlane_b32 s5, v0, 1 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v6 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v7 -; GFX9-G-O0-NEXT: s_mov_b32 s4, 1 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-G-O0-NEXT: s_mov_b32 s6, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-G-O0-NEXT: v_lshlrev_b64 v[10:11], v0, v[2:3] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-G-O0-NEXT: v_lshlrev_b64 v[0:1], v0, v[4:5] ; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr2 killed $exec ; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed $exec -; GFX9-G-O0-NEXT: s_mov_b32 s4, 31 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4 +; 
GFX9-G-O0-NEXT: s_mov_b32 s6, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v6, v2, v3 -; GFX9-G-O0-NEXT: s_mov_b32 s4, 0 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-G-O0-NEXT: s_mov_b32 s6, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v14 @@ -1798,15 +1786,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_branch .LBB0_3 -; GFX9-G-O0-NEXT: .LBB0_5: ; %Flow1 -; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-G-O0-NEXT: v_readlane_b32 s4, v8, 4 -; GFX9-G-O0-NEXT: v_readlane_b32 s5, v8, 5 ; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-G-O0-NEXT: s_branch .LBB0_2 +; GFX9-G-O0-NEXT: .LBB0_4: ; %Flow1 ; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload @@ -1826,15 +1808,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_branch .LBB0_4 -; GFX9-G-O0-NEXT: .LBB0_6: ; %udiv-do-while +; GFX9-G-O0-NEXT: s_branch .LBB0_3 +; GFX9-G-O0-NEXT: .LBB0_5: ; 
%udiv-do-while ; GFX9-G-O0-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-G-O0-NEXT: v_readlane_b32 s6, v16, 6 -; GFX9-G-O0-NEXT: v_readlane_b32 s7, v16, 7 ; GFX9-G-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload @@ -1851,6 +1827,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-G-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload @@ -1859,8 +1838,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(8) +; GFX9-G-O0-NEXT: v_readlane_b32 s6, v16, 4 +; GFX9-G-O0-NEXT: v_readlane_b32 s7, v16, 5 ; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-G-O0-NEXT: s_waitcnt 
vmcnt(16) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v4 @@ -1912,7 +1893,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s8 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(8) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v29, v31 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v30, v32 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v25, v33 @@ -1995,7 +1975,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, s5 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, s4 ; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[17:18], v[19:20] -; GFX9-G-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX9-G-O0-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7] ; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v3 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, v2 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v1 @@ -2014,12 +1994,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-G-O0-NEXT: v_writelane_b32 v16, s6, 2 -; GFX9-G-O0-NEXT: v_writelane_b32 v16, s7, 3 -; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-G-O0-NEXT: v_writelane_b32 v16, s6, 6 -; GFX9-G-O0-NEXT: v_writelane_b32 v16, s7, 7 +; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX9-G-O0-NEXT: v_writelane_b32 v16, s4, 4 +; GFX9-G-O0-NEXT: v_writelane_b32 v16, s5, 5 ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] @@ -2043,10 +2020,11 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 
4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-G-O0-NEXT: s_cbranch_execnz .LBB0_6 +; GFX9-G-O0-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX9-G-O0-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX9-G-O0-NEXT: s_cbranch_scc1 .LBB0_5 ; GFX9-G-O0-NEXT: s_branch .LBB0_1 -; GFX9-G-O0-NEXT: .LBB0_7: ; %udiv-preheader +; GFX9-G-O0-NEXT: .LBB0_6: ; %udiv-preheader ; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload @@ -2132,8 +2110,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[8:9] -; GFX9-G-O0-NEXT: v_writelane_b32 v12, s8, 6 -; GFX9-G-O0-NEXT: v_writelane_b32 v12, s9, 7 +; GFX9-G-O0-NEXT: v_writelane_b32 v12, s8, 4 +; GFX9-G-O0-NEXT: v_writelane_b32 v12, s9, 5 ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] @@ -2161,8 +2139,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_branch .LBB0_6 -; GFX9-G-O0-NEXT: .LBB0_8: ; %udiv-bb1 +; GFX9-G-O0-NEXT: s_branch .LBB0_5 +; GFX9-G-O0-NEXT: .LBB0_7: ; %udiv-bb1 ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], 
-1 ; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] @@ -2273,18 +2251,17 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], exec -; GFX9-G-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] -; GFX9-G-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] -; GFX9-G-O0-NEXT: v_writelane_b32 v0, s6, 4 -; GFX9-G-O0-NEXT: v_writelane_b32 v0, s7, 5 +; GFX9-G-O0-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX9-G-O0-NEXT: v_writelane_b32 v0, s6, 2 +; GFX9-G-O0-NEXT: v_writelane_b32 v0, s7, 3 ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-G-O0-NEXT: s_cbranch_execz .LBB0_5 -; GFX9-G-O0-NEXT: s_branch .LBB0_7 -; GFX9-G-O0-NEXT: .LBB0_9: ; %udiv-end +; GFX9-G-O0-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-G-O0-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-G-O0-NEXT: s_cbranch_scc1 .LBB0_6 +; GFX9-G-O0-NEXT: s_branch .LBB0_4 +; GFX9-G-O0-NEXT: .LBB0_8: ; %udiv-end ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] @@ -2317,10 +2294,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_nop 0 -; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; 
GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-G-O0-NEXT: s_setpc_b64 s[30:31] @@ -2375,6 +2351,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_or_b32_e32 v10, v13, v15 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15] +; GFX9-NEXT: s_mov_b64 s[8:9], exec ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] ; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc @@ -2385,13 +2362,14 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[9:10] ; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], vcc ; GFX9-NEXT: v_cndmask_b32_e64 v8, v3, 0, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, 0, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v10, v1, 0, s[4:5] +; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec ; GFX9-NEXT: v_cndmask_b32_e64 v11, v0, 0, s[4:5] -; GFX9-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB1_6 +; GFX9-NEXT: s_cmov_b64 exec, s[6:7] +; GFX9-NEXT: s_cbranch_scc0 .LBB1_6 ; GFX9-NEXT: ; %bb.1: ; %udiv-bb1 ; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 1, v12 ; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v13, vcc @@ -2410,20 +2388,21 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_or_b32_e32 v10, v10, v13 ; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v15 ; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v11, 
s[4:5] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v15 ; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[10:11], v15, v[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v15 -; GFX9-NEXT: v_mov_b32_e32 v12, 0 -; GFX9-NEXT: v_mov_b32_e32 v14, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v3, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v2, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-NEXT: v_mov_b32_e32 v14, 0 +; GFX9-NEXT: s_xor_b64 s[6:7], vcc, exec ; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, v11, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, v10, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: v_mov_b32_e32 v15, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, v10, s[4:5] -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB1_5 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB1_5 ; GFX9-NEXT: ; %bb.2: ; %udiv-preheader ; GFX9-NEXT: v_sub_u32_e32 v14, 64, v18 ; GFX9-NEXT: v_lshrrev_b64 v[12:13], v18, v[0:1] @@ -2486,16 +2465,16 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_or_b32_e32 v17, v19, v21 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] ; GFX9-NEXT: v_and_b32_e32 v12, 1, v26 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v17, v13 ; GFX9-NEXT: v_or3_b32 v9, v9, 0, v15 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[10:11], exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v16, v12 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: s_cselect_b64 exec, s[10:11], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB1_3 ; GFX9-NEXT: ; %bb.4: ; %Flow -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: .LBB1_5: ; %Flow2 ; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: .LBB1_5: ; %Flow2 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[10:11] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[8:9] ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v11 
@@ -2503,8 +2482,8 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_or3_b32 v9, v2, v4, v14 ; GFX9-NEXT: v_or_b32_e32 v10, v13, v1 ; GFX9-NEXT: v_or_b32_e32 v11, v12, v0 -; GFX9-NEXT: .LBB1_6: ; %Flow3 ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-NEXT: .LBB1_6: ; %udiv-end ; GFX9-NEXT: v_mov_b32_e32 v0, v11 ; GFX9-NEXT: v_mov_b32_e32 v1, v10 ; GFX9-NEXT: v_mov_b32_e32 v2, v9 @@ -2516,8 +2495,8 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane @@ -2741,32 +2720,25 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 -; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7] +; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 
s[4:5], exec -; GFX9-O0-NEXT: v_writelane_b32 v0, s4, 2 -; GFX9-O0-NEXT: v_writelane_b32 v0, s5, 3 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 2 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 3 +; GFX9-O0-NEXT: s_mov_b64 s[18:19], exec +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: s_cbranch_execz .LBB1_3 -; GFX9-O0-NEXT: s_branch .LBB1_8 +; GFX9-O0-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_cbranch_scc1 .LBB1_7 +; GFX9-O0-NEXT: s_branch .LBB1_2 ; GFX9-O0-NEXT: .LBB1_1: ; %Flow -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v0, 4 -; GFX9-O0-NEXT: v_readlane_b32 s5, v0, 5 -; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: ; %bb.2: ; %Flow ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload @@ -2775,7 +2747,12 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(6) +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 
exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 4 +; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 5 ; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill @@ -2788,15 +2765,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_branch .LBB1_5 -; GFX9-O0-NEXT: .LBB1_3: ; %Flow2 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v4, 2 -; GFX9-O0-NEXT: v_readlane_b32 s5, v4, 3 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: s_branch .LBB1_4 +; GFX9-O0-NEXT: .LBB1_2: ; %Flow2 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -2808,8 +2779,8 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_branch .LBB1_9 -; GFX9-O0-NEXT: .LBB1_4: ; %udiv-loop-exit +; GFX9-O0-NEXT: s_branch .LBB1_8 +; GFX9-O0-NEXT: .LBB1_3: ; %udiv-loop-exit ; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; 
GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload @@ -2818,13 +2789,17 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b32 s4, 1 -; GFX9-O0-NEXT: s_waitcnt vmcnt(2) -; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[0:1] +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_lshlrev_b64 v[9:10], s4, v[9:10] -; GFX9-O0-NEXT: s_mov_b32 s4, 63 -; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 2 +; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 3 +; GFX9-O0-NEXT: s_mov_b32 s6, 1 +; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s6, v[0:1] +; GFX9-O0-NEXT: v_lshlrev_b64 v[9:10], s6, v[9:10] +; GFX9-O0-NEXT: s_mov_b32 s6, 63 +; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s6, v[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v8 @@ -2848,15 +2823,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_branch .LBB1_3 -; GFX9-O0-NEXT: .LBB1_5: ; %Flow1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 6 -; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 7 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; 
GFX9-O0-NEXT: s_branch .LBB1_2 +; GFX9-O0-NEXT: .LBB1_4: ; %Flow1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -2878,15 +2847,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_branch .LBB1_4 -; GFX9-O0-NEXT: .LBB1_6: ; %udiv-do-while +; GFX9-O0-NEXT: s_branch .LBB1_3 +; GFX9-O0-NEXT: .LBB1_5: ; %udiv-do-while ; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s6, v16, 8 -; GFX9-O0-NEXT: v_readlane_b32 s7, v16, 9 ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload @@ -2903,6 +2866,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; 
GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload @@ -2911,8 +2877,10 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(8) +; GFX9-O0-NEXT: v_readlane_b32 s6, v16, 6 +; GFX9-O0-NEXT: v_readlane_b32 s7, v16, 7 ; GFX9-O0-NEXT: s_mov_b32 s4, 63 -; GFX9-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-O0-NEXT: v_lshrrev_b64 v[29:30], s4, v[2:3] ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v30 ; GFX9-O0-NEXT: s_mov_b32 s5, 1 @@ -2939,7 +2907,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v30 -; GFX9-O0-NEXT: s_waitcnt vmcnt(10) ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v28 ; GFX9-O0-NEXT: v_or3_b32 v6, v6, v7, v10 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 @@ -2949,7 +2916,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-O0-NEXT: s_waitcnt vmcnt(8) ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v26 ; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 @@ -3047,7 +3013,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v18, v19 ; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[17:18], v[12:13] -; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX9-O0-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v18, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v2 ; 
GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill @@ -3068,12 +3034,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 4 -; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 5 -; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 8 -; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 9 +; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX9-O0-NEXT: v_writelane_b32 v16, s4, 6 +; GFX9-O0-NEXT: v_writelane_b32 v16, s5, 7 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] @@ -3101,10 +3064,11 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: s_cbranch_execnz .LBB1_6 +; GFX9-O0-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX9-O0-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX9-O0-NEXT: s_cbranch_scc1 .LBB1_5 ; GFX9-O0-NEXT: s_branch .LBB1_1 -; GFX9-O0-NEXT: .LBB1_7: ; %udiv-preheader +; GFX9-O0-NEXT: .LBB1_6: ; %udiv-preheader ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload @@ -3207,8 +3171,8 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v14, s8 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, s7 ; 
GFX9-O0-NEXT: v_mov_b32_e32 v12, s6 -; GFX9-O0-NEXT: v_writelane_b32 v16, s4, 8 -; GFX9-O0-NEXT: v_writelane_b32 v16, s5, 9 +; GFX9-O0-NEXT: v_writelane_b32 v16, s4, 6 +; GFX9-O0-NEXT: v_writelane_b32 v16, s5, 7 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] @@ -3236,8 +3200,8 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_branch .LBB1_6 -; GFX9-O0-NEXT: .LBB1_8: ; %udiv-bb1 +; GFX9-O0-NEXT: s_branch .LBB1_5 +; GFX9-O0-NEXT: .LBB1_7: ; %udiv-bb1 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] @@ -3362,18 +3326,17 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec -; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] -; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] -; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 6 -; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 7 +; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 4 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 5 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: s_cbranch_execz .LBB1_5 -; GFX9-O0-NEXT: s_branch .LBB1_7 -; GFX9-O0-NEXT: .LBB1_9: ; %udiv-end +; GFX9-O0-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-O0-NEXT: 
s_cmov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_cbranch_scc1 .LBB1_6 +; GFX9-O0-NEXT: s_branch .LBB1_4 +; GFX9-O0-NEXT: .LBB1_8: ; %udiv-end ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] @@ -3393,8 +3356,10 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) @@ -3444,26 +3409,28 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_or_b32_e32 v17, v13, v15 ; GFX9-G-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[6:7] ; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[14:15] -; GFX9-G-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-G-NEXT: s_mov_b64 s[12:13], exec ; GFX9-G-NEXT: v_cndmask_b32_e64 v8, v9, v8, s[6:7] ; GFX9-G-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; GFX9-G-NEXT: v_or_b32_e32 v18, v9, v8 +; GFX9-G-NEXT: v_or_b32_e32 v9, v9, v8 ; GFX9-G-NEXT: v_xor_b32_e32 v8, 0x7f, v12 ; GFX9-G-NEXT: v_or_b32_e32 v16, v8, v14 -; GFX9-G-NEXT: v_and_b32_e32 v8, 1, v18 +; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[16:17] +; GFX9-G-NEXT: v_and_b32_e32 v8, 1, v9 +; GFX9-G-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; GFX9-G-NEXT: v_or_b32_e32 v9, v9, v16 +; GFX9-G-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX9-G-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v9 ; 
GFX9-G-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-G-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GFX9-G-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-G-NEXT: v_cndmask_b32_e64 v10, v0, 0, vcc ; GFX9-G-NEXT: v_cndmask_b32_e64 v11, v1, 0, vcc ; GFX9-G-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc +; GFX9-G-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX9-G-NEXT: v_cndmask_b32_e64 v9, v3, 0, vcc -; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] -; GFX9-G-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GFX9-G-NEXT: v_or_b32_e32 v16, v18, v16 -; GFX9-G-NEXT: v_and_b32_e32 v16, 1, v16 -; GFX9-G-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-G-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GFX9-G-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GFX9-G-NEXT: s_cbranch_execz .LBB1_6 +; GFX9-G-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-G-NEXT: s_cbranch_scc0 .LBB1_6 ; GFX9-G-NEXT: ; %bb.1: ; %udiv-bb1 ; GFX9-G-NEXT: v_add_co_u32_e32 v18, vcc, 1, v12 ; GFX9-G-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v13, vcc @@ -3481,20 +3448,22 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], v14, v[0:1] ; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 ; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX9-G-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX9-G-NEXT: v_cndmask_b32_e32 v14, 0, v12, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v15, 0, v13, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc ; GFX9-G-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 ; GFX9-G-NEXT: v_mov_b32_e32 v13, s11 +; GFX9-G-NEXT: s_xor_b64 s[6:7], s[4:5], exec ; GFX9-G-NEXT: v_cndmask_b32_e32 v8, v8, v2, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v9, v9, v3, vcc ; GFX9-G-NEXT: v_mov_b32_e32 v11, s9 ; GFX9-G-NEXT: v_mov_b32_e32 v10, s8 +; GFX9-G-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX9-G-NEXT: v_mov_b32_e32 v12, s10 -; GFX9-G-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] -; GFX9-G-NEXT: s_xor_b64 s[12:13], exec, s[8:9] -; GFX9-G-NEXT: s_cbranch_execz .LBB1_5 +; GFX9-G-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-G-NEXT: s_cbranch_scc0 .LBB1_5 ; 
GFX9-G-NEXT: ; %bb.2: ; %udiv-preheader ; GFX9-G-NEXT: v_sub_u32_e32 v12, 64, v18 ; GFX9-G-NEXT: v_subrev_u32_e32 v22, 64, v18 @@ -3505,7 +3474,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_or_b32_e32 v10, v10, v12 ; GFX9-G-NEXT: v_or_b32_e32 v11, v11, v13 ; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 -; GFX9-G-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-G-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc @@ -3557,24 +3525,24 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_or_b32_e32 v11, v19, v21 ; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] ; GFX9-G-NEXT: v_subb_co_u32_e32 v16, vcc, v12, v17, vcc +; GFX9-G-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] ; GFX9-G-NEXT: v_mov_b32_e32 v11, v1 ; GFX9-G-NEXT: v_subb_co_u32_e32 v17, vcc, v13, v26, vcc -; GFX9-G-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] +; GFX9-G-NEXT: s_andn2_b64 s[4:5], exec, s[8:9] ; GFX9-G-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-G-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX9-G-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-G-NEXT: s_cselect_b64 exec, s[4:5], s[8:9] +; GFX9-G-NEXT: s_cbranch_scc1 .LBB1_3 ; GFX9-G-NEXT: ; %bb.4: ; %Flow -; GFX9-G-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-G-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-G-NEXT: .LBB1_5: ; %Flow2 -; GFX9-G-NEXT: s_or_b64 exec, exec, s[12:13] ; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], 1, v[14:15] ; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] ; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 31, v15 ; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v2 ; GFX9-G-NEXT: v_or_b32_e32 v10, v10, v0 ; GFX9-G-NEXT: v_or_b32_e32 v11, v11, v1 -; GFX9-G-NEXT: .LBB1_6: ; %Flow3 -; GFX9-G-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-G-NEXT: s_or_b64 exec, exec, s[12:13] +; GFX9-G-NEXT: .LBB1_6: ; %udiv-end ; GFX9-G-NEXT: v_mov_b32_e32 v0, v10 ; GFX9-G-NEXT: v_mov_b32_e32 v1, v11 ; GFX9-G-NEXT: v_mov_b32_e32 v2, v8 @@ -3586,10 +3554,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 
%rhs) { ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-G-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-G-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane ; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v0 @@ -3796,31 +3763,24 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_and_b32_e32 v5, 1, v5 ; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v5 ; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], -1 -; GFX9-G-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] +; GFX9-G-O0-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], exec -; GFX9-G-O0-NEXT: v_writelane_b32 v0, s4, 0 -; GFX9-G-O0-NEXT: v_writelane_b32 v0, s5, 1 -; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], exec +; GFX9-G-O0-NEXT: 
v_writelane_b32 v0, s6, 0 +; GFX9-G-O0-NEXT: v_writelane_b32 v0, s7, 1 +; GFX9-G-O0-NEXT: s_mov_b64 s[18:19], exec +; GFX9-G-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-G-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-G-O0-NEXT: s_cbranch_execz .LBB1_3 -; GFX9-G-O0-NEXT: s_branch .LBB1_8 +; GFX9-G-O0-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-G-O0-NEXT: s_cbranch_scc1 .LBB1_7 +; GFX9-G-O0-NEXT: s_branch .LBB1_2 ; GFX9-G-O0-NEXT: .LBB1_1: ; %Flow -; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-G-O0-NEXT: v_readlane_b32 s4, v0, 2 -; GFX9-G-O0-NEXT: v_readlane_b32 s5, v0, 3 -; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-G-O0-NEXT: ; %bb.2: ; %Flow ; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload @@ -3829,7 +3789,12 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(4) +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_readlane_b32 s4, v8, 2 +; GFX9-G-O0-NEXT: v_readlane_b32 s5, v8, 3 ; GFX9-G-O0-NEXT: buffer_store_dword v4, off, 
s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill @@ -3840,15 +3805,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_branch .LBB1_5 -; GFX9-G-O0-NEXT: .LBB1_3: ; %Flow2 -; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-G-O0-NEXT: v_readlane_b32 s4, v4, 0 -; GFX9-G-O0-NEXT: v_readlane_b32 s5, v4, 1 ; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-G-O0-NEXT: s_branch .LBB1_4 +; GFX9-G-O0-NEXT: .LBB1_2: ; %Flow2 ; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -3859,8 +3818,8 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_branch .LBB1_9 -; GFX9-G-O0-NEXT: .LBB1_4: ; %udiv-loop-exit +; GFX9-G-O0-NEXT: s_branch .LBB1_8 +; GFX9-G-O0-NEXT: .LBB1_3: ; %udiv-loop-exit ; GFX9-G-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: 
buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload @@ -3869,23 +3828,28 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_readlane_b32 s4, v0, 0 +; GFX9-G-O0-NEXT: v_readlane_b32 s5, v0, 1 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v6 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v7 -; GFX9-G-O0-NEXT: s_mov_b32 s4, 1 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-G-O0-NEXT: s_mov_b32 s6, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-G-O0-NEXT: v_lshlrev_b64 v[10:11], v0, v[2:3] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-G-O0-NEXT: v_lshlrev_b64 v[0:1], v0, v[4:5] ; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr2 killed $exec ; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed $exec -; GFX9-G-O0-NEXT: s_mov_b32 s4, 31 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-G-O0-NEXT: s_mov_b32 s6, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v6, v2, v3 -; GFX9-G-O0-NEXT: s_mov_b32 s4, 0 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-G-O0-NEXT: s_mov_b32 s6, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v14 @@ -3914,15 +3878,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; 
GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_branch .LBB1_3 -; GFX9-G-O0-NEXT: .LBB1_5: ; %Flow1 -; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-G-O0-NEXT: v_readlane_b32 s4, v8, 4 -; GFX9-G-O0-NEXT: v_readlane_b32 s5, v8, 5 ; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-G-O0-NEXT: s_branch .LBB1_2 +; GFX9-G-O0-NEXT: .LBB1_4: ; %Flow1 ; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload @@ -3942,15 +3900,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_branch .LBB1_4 -; GFX9-G-O0-NEXT: .LBB1_6: ; %udiv-do-while +; GFX9-G-O0-NEXT: s_branch .LBB1_3 +; GFX9-G-O0-NEXT: .LBB1_5: ; %udiv-do-while ; GFX9-G-O0-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-G-O0-NEXT: v_readlane_b32 s6, v16, 6 -; GFX9-G-O0-NEXT: v_readlane_b32 s7, v16, 7 ; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; 
GFX9-G-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload @@ -3967,6 +3919,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-G-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload @@ -3975,8 +3930,10 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(8) +; GFX9-G-O0-NEXT: v_readlane_b32 s6, v16, 4 +; GFX9-G-O0-NEXT: v_readlane_b32 s7, v16, 5 ; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v4 @@ -4028,7 +3985,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s8 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(8) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v29, v31 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v30, v32 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v33 @@ -4119,7 +4075,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; 
GFX9-G-O0-NEXT: v_mov_b32_e32 v20, s5 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, s4 ; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[17:18], v[19:20] -; GFX9-G-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX9-G-O0-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7] ; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v3 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, v2 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v1 @@ -4138,12 +4094,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-G-O0-NEXT: v_writelane_b32 v16, s6, 2 -; GFX9-G-O0-NEXT: v_writelane_b32 v16, s7, 3 -; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-G-O0-NEXT: v_writelane_b32 v16, s6, 6 -; GFX9-G-O0-NEXT: v_writelane_b32 v16, s7, 7 +; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX9-G-O0-NEXT: v_writelane_b32 v16, s4, 4 +; GFX9-G-O0-NEXT: v_writelane_b32 v16, s5, 5 ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] @@ -4167,10 +4120,11 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-G-O0-NEXT: s_cbranch_execnz .LBB1_6 +; GFX9-G-O0-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX9-G-O0-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX9-G-O0-NEXT: s_cbranch_scc1 .LBB1_5 ; GFX9-G-O0-NEXT: s_branch .LBB1_1 -; GFX9-G-O0-NEXT: .LBB1_7: ; %udiv-preheader +; GFX9-G-O0-NEXT: .LBB1_6: ; 
%udiv-preheader ; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload @@ -4260,8 +4214,8 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[8:9] -; GFX9-G-O0-NEXT: v_writelane_b32 v12, s8, 6 -; GFX9-G-O0-NEXT: v_writelane_b32 v12, s9, 7 +; GFX9-G-O0-NEXT: v_writelane_b32 v12, s8, 4 +; GFX9-G-O0-NEXT: v_writelane_b32 v12, s9, 5 ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] @@ -4289,8 +4243,8 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_branch .LBB1_6 -; GFX9-G-O0-NEXT: .LBB1_8: ; %udiv-bb1 +; GFX9-G-O0-NEXT: s_branch .LBB1_5 +; GFX9-G-O0-NEXT: .LBB1_7: ; %udiv-bb1 ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] @@ -4401,18 +4355,17 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], exec -; GFX9-G-O0-NEXT: s_and_b64 s[4:5], 
s[6:7], s[4:5] -; GFX9-G-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] -; GFX9-G-O0-NEXT: v_writelane_b32 v0, s6, 4 -; GFX9-G-O0-NEXT: v_writelane_b32 v0, s7, 5 +; GFX9-G-O0-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX9-G-O0-NEXT: v_writelane_b32 v0, s6, 2 +; GFX9-G-O0-NEXT: v_writelane_b32 v0, s7, 3 ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-G-O0-NEXT: s_cbranch_execz .LBB1_5 -; GFX9-G-O0-NEXT: s_branch .LBB1_7 -; GFX9-G-O0-NEXT: .LBB1_9: ; %udiv-end +; GFX9-G-O0-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-G-O0-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-G-O0-NEXT: s_cbranch_scc1 .LBB1_6 +; GFX9-G-O0-NEXT: s_branch .LBB1_4 +; GFX9-G-O0-NEXT: .LBB1_8: ; %udiv-end ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] @@ -4429,10 +4382,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_nop 0 -; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-G-O0-NEXT: 
s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll index e04cd711256081..8a0a901f46efec 100644 --- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll @@ -6,11 +6,12 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-LABEL: v_sdiv_v2i128_vv: ; SDAG: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b64 s[10:11], exec ; SDAG-NEXT: v_ashrrev_i32_e32 v26, 31, v3 ; SDAG-NEXT: v_ashrrev_i32_e32 v27, 31, v11 ; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 0, v0 ; SDAG-NEXT: v_mov_b32_e32 v19, 0 -; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f +; SDAG-NEXT: s_mov_b64 s[12:13], 0x7f ; SDAG-NEXT: v_mov_b32_e32 v28, v26 ; SDAG-NEXT: v_mov_b32_e32 v29, v27 ; SDAG-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc @@ -67,7 +68,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v11, v10, vcc ; SDAG-NEXT: v_xor_b32_e32 v10, 0x7f, v8 ; SDAG-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v19, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[8:9] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[12:13], v[8:9] ; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] ; SDAG-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v19, vcc ; SDAG-NEXT: v_or_b32_e32 v10, v10, v18 @@ -84,10 +85,11 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; SDAG-NEXT: v_cndmask_b32_e64 v21, v2, 0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v22, v17, 0, s[4:5] +; SDAG-NEXT: s_and_b64 s[6:7], s[6:7], vcc +; SDAG-NEXT: s_and_b64 s[6:7], s[6:7], exec ; SDAG-NEXT: v_cndmask_b32_e64 v23, v16, 0, s[4:5] -; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB0_6 +; SDAG-NEXT: s_cmov_b64 exec, s[6:7] +; SDAG-NEXT: s_cbranch_scc0 .LBB0_6 ; SDAG-NEXT: ; %bb.1: ; %udiv-bb15 ; 
SDAG-NEXT: v_add_i32_e32 v32, vcc, 1, v8 ; SDAG-NEXT: v_sub_i32_e64 v20, s[4:5], 63, v8 @@ -107,26 +109,27 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_lshr_b64 v[18:19], v[16:17], v25 ; SDAG-NEXT: v_or_b32_e32 v9, v9, v19 ; SDAG-NEXT: v_or_b32_e32 v8, v8, v18 +; SDAG-NEXT: s_xor_b64 s[6:7], vcc, exec ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24 ; SDAG-NEXT: v_cndmask_b32_e64 v9, v21, v9, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v8, v20, v8, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v23, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v22, s[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], vcc, -1 ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24 ; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, v3, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, v2, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v18, 0 ; SDAG-NEXT: v_mov_b32_e32 v19, 0 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB0_5 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB0_5 ; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4 ; SDAG-NEXT: v_lshr_b64 v[10:11], v[16:17], v32 ; SDAG-NEXT: v_sub_i32_e32 v37, vcc, 64, v32 ; SDAG-NEXT: v_subrev_i32_e32 v48, vcc, 64, v32 ; SDAG-NEXT: v_lshr_b64 v[24:25], v[2:3], v32 ; SDAG-NEXT: v_add_i32_e32 v36, vcc, -1, v31 -; SDAG-NEXT: s_mov_b64 s[10:11], 0 +; SDAG-NEXT: s_mov_b64 s[8:9], 0 ; SDAG-NEXT: v_mov_b32_e32 v22, 0 ; SDAG-NEXT: v_mov_b32_e32 v23, 0 ; SDAG-NEXT: v_mov_b32_e32 v18, 0 @@ -183,16 +186,16 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v17, v33, v35 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] ; SDAG-NEXT: v_or_b32_e32 v21, v23, v21 -; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; SDAG-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SDAG-NEXT: s_andn2_b64 s[4:5], exec, s[8:9] ; SDAG-NEXT: v_or_b32_e32 v20, v22, v20 ; SDAG-NEXT: v_mov_b32_e32 v23, v11 ; SDAG-NEXT: v_mov_b32_e32 v22, v10 -; SDAG-NEXT: s_andn2_b64 exec, 
exec, s[10:11] -; SDAG-NEXT: s_cbranch_execnz .LBB0_3 +; SDAG-NEXT: s_cselect_b64 exec, s[4:5], s[8:9] +; SDAG-NEXT: s_cbranch_scc1 .LBB0_3 ; SDAG-NEXT: ; %bb.4: ; %Flow13 -; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: .LBB0_5: ; %Flow14 -; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] ; SDAG-NEXT: v_lshl_b64 v[0:1], v[8:9], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v21 ; SDAG-NEXT: v_lshl_b64 v[2:3], v[20:21], 1 @@ -201,13 +204,14 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v22, v11, v3 ; SDAG-NEXT: v_or_b32_e32 v21, v18, v0 ; SDAG-NEXT: v_or_b32_e32 v23, v10, v2 -; SDAG-NEXT: .LBB0_6: ; %Flow16 -; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] +; SDAG-NEXT: .LBB0_6: ; %udiv-end1 +; SDAG-NEXT: s_mov_b64 s[10:11], exec ; SDAG-NEXT: v_ashrrev_i32_e32 v16, 31, v7 ; SDAG-NEXT: v_ashrrev_i32_e32 v17, 31, v15 ; SDAG-NEXT: v_sub_i32_e32 v0, vcc, 0, v4 ; SDAG-NEXT: v_mov_b32_e32 v9, 0 -; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f +; SDAG-NEXT: s_mov_b64 s[12:13], 0x7f ; SDAG-NEXT: v_mov_b32_e32 v18, v16 ; SDAG-NEXT: v_mov_b32_e32 v19, v17 ; SDAG-NEXT: v_subb_u32_e32 v1, vcc, 0, v5, vcc @@ -264,7 +268,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v8, v12, vcc ; SDAG-NEXT: v_xor_b32_e32 v10, 0x7f, v6 ; SDAG-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v9, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[6:7] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[12:13], v[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; SDAG-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc ; SDAG-NEXT: v_or_b32_e32 v10, v10, v8 @@ -281,10 +285,11 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; SDAG-NEXT: v_cndmask_b32_e64 v11, v4, 0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v14, v3, 0, s[4:5] +; SDAG-NEXT: s_and_b64 s[6:7], s[6:7], vcc +; 
SDAG-NEXT: s_and_b64 s[6:7], s[6:7], exec ; SDAG-NEXT: v_cndmask_b32_e64 v10, v2, 0, s[4:5] -; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB0_12 +; SDAG-NEXT: s_cmov_b64 exec, s[6:7] +; SDAG-NEXT: s_cbranch_scc0 .LBB0_12 ; SDAG-NEXT: ; %bb.7: ; %udiv-bb1 ; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v6 ; SDAG-NEXT: v_sub_i32_e64 v12, s[4:5], 63, v6 @@ -304,26 +309,27 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_lshr_b64 v[6:7], v[2:3], v6 ; SDAG-NEXT: v_or_b32_e32 v7, v15, v7 ; SDAG-NEXT: v_or_b32_e32 v6, v14, v6 +; SDAG-NEXT: s_xor_b64 s[6:7], vcc, exec ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v9 ; SDAG-NEXT: v_cndmask_b32_e64 v8, v13, v7, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v12, v12, v6, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v35, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v34, s[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], vcc, -1 ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v9 ; SDAG-NEXT: v_cndmask_b32_e64 v9, v8, v5, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v8, v12, v4, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v12, 0 ; SDAG-NEXT: v_mov_b32_e32 v13, 0 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB0_11 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB0_11 ; SDAG-NEXT: ; %bb.8: ; %udiv-preheader ; SDAG-NEXT: v_lshr_b64 v[10:11], v[2:3], v30 ; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v30 ; SDAG-NEXT: v_subrev_i32_e32 v36, vcc, 64, v30 ; SDAG-NEXT: v_lshr_b64 v[37:38], v[4:5], v30 ; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v25 -; SDAG-NEXT: s_mov_b64 s[10:11], 0 +; SDAG-NEXT: s_mov_b64 s[8:9], 0 ; SDAG-NEXT: v_mov_b32_e32 v14, 0 ; SDAG-NEXT: v_mov_b32_e32 v15, 0 ; SDAG-NEXT: v_mov_b32_e32 v12, 0 @@ -380,16 +386,16 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v39, v31, v33 ; SDAG-NEXT: v_or_b32_e32 v38, v30, v32 ; 
SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[38:39] -; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; SDAG-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SDAG-NEXT: s_andn2_b64 s[4:5], exec, s[8:9] ; SDAG-NEXT: v_or_b32_e32 v6, v14, v6 ; SDAG-NEXT: v_mov_b32_e32 v15, v11 ; SDAG-NEXT: v_mov_b32_e32 v14, v10 -; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] -; SDAG-NEXT: s_cbranch_execnz .LBB0_9 +; SDAG-NEXT: s_cselect_b64 exec, s[4:5], s[8:9] +; SDAG-NEXT: s_cbranch_scc1 .LBB0_9 ; SDAG-NEXT: ; %bb.10: ; %Flow -; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: .LBB0_11: ; %Flow11 -; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] ; SDAG-NEXT: v_lshl_b64 v[0:1], v[8:9], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v4, 31, v7 ; SDAG-NEXT: v_lshl_b64 v[2:3], v[6:7], 1 @@ -398,8 +404,8 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v14, v11, v3 ; SDAG-NEXT: v_or_b32_e32 v11, v12, v0 ; SDAG-NEXT: v_or_b32_e32 v10, v10, v2 -; SDAG-NEXT: .LBB0_12: ; %Flow12 -; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] +; SDAG-NEXT: .LBB0_12: ; %udiv-end ; SDAG-NEXT: v_xor_b32_e32 v3, v29, v28 ; SDAG-NEXT: v_xor_b32_e32 v2, v27, v26 ; SDAG-NEXT: v_xor_b32_e32 v7, v19, v18 @@ -425,6 +431,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-LABEL: v_sdiv_v2i128_vv: ; GISEL: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b64 s[12:13], exec ; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: v_ashrrev_i32_e32 v24, 31, v3 ; GISEL-NEXT: v_ashrrev_i32_e32 v25, 31, v11 @@ -496,14 +503,15 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_or_b32_e32 v8, v9, v8 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v20, v16, 0, vcc -; GISEL-NEXT: v_and_b32_e32 v22, 1, v8 +; GISEL-NEXT: v_and_b32_e32 v9, 1, v8 ; GISEL-NEXT: 
v_cndmask_b32_e64 v21, v17, 0, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v8, v18, 0, vcc +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v9 +; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GISEL-NEXT: v_cndmask_b32_e64 v9, v19, 0, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB0_6 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB0_6 ; GISEL-NEXT: ; %bb.1: ; %udiv-bb15 ; GISEL-NEXT: v_add_i32_e32 v28, vcc, 1, v0 ; GISEL-NEXT: v_addc_u32_e64 v29, s[4:5], 0, v1, vcc @@ -522,19 +530,21 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cndmask_b32_e32 v21, 0, v1, vcc ; GISEL-NEXT: v_or_b32_e32 v0, v8, v2 ; GISEL-NEXT: v_or_b32_e32 v1, v9, v3 +; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GISEL-NEXT: v_cndmask_b32_e32 v0, v22, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, v23, v1, vcc +; GISEL-NEXT: s_xor_b64 s[14:15], s[4:5], exec ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32 ; GISEL-NEXT: v_cndmask_b32_e32 v8, v0, v18, vcc +; GISEL-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e32 v9, v1, v19, vcc ; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] ; GISEL-NEXT: v_mov_b32_e32 v0, s8 ; GISEL-NEXT: v_mov_b32_e32 v1, s9 ; GISEL-NEXT: v_mov_b32_e32 v2, s10 ; GISEL-NEXT: v_mov_b32_e32 v3, s11 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GISEL-NEXT: s_xor_b64 s[14:15], exec, s[6:7] -; GISEL-NEXT: s_cbranch_execz .LBB0_5 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB0_5 ; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4 ; GISEL-NEXT: v_subrev_i32_e32 v34, vcc, 64, v28 ; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v28 @@ -594,66 +604,67 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_and_b32_e32 v16, 1, v0 ; GISEL-NEXT: v_and_b32_e32 v36, v0, v10 ; GISEL-NEXT: v_and_b32_e32 v0, v0, v11 +; 
GISEL-NEXT: s_andn2_b64 s[4:5], exec, s[8:9] ; GISEL-NEXT: v_sub_i32_e32 v22, vcc, v3, v1 ; GISEL-NEXT: v_subb_u32_e32 v23, vcc, v37, v18, vcc ; GISEL-NEXT: v_subb_u32_e32 v18, vcc, v2, v36, vcc ; GISEL-NEXT: v_subb_u32_e32 v19, vcc, v19, v0, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v16 ; GISEL-NEXT: v_mov_b32_e32 v1, v17 -; GISEL-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GISEL-NEXT: s_cbranch_execnz .LBB0_3 +; GISEL-NEXT: s_cselect_b64 exec, s[4:5], s[8:9] +; GISEL-NEXT: s_cbranch_scc1 .LBB0_3 ; GISEL-NEXT: ; %bb.4: ; %Flow13 -; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] -; GISEL-NEXT: .LBB0_5: ; %Flow14 ; GISEL-NEXT: s_or_b64 exec, exec, s[14:15] +; GISEL-NEXT: .LBB0_5: ; %Flow14 ; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1 ; GISEL-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 ; GISEL-NEXT: v_lshrrev_b32_e32 v10, 31, v21 ; GISEL-NEXT: v_or_b32_e32 v8, v8, v10 ; GISEL-NEXT: v_or_b32_e32 v20, v0, v2 ; GISEL-NEXT: v_or_b32_e32 v21, v1, v3 -; GISEL-NEXT: .LBB0_6: ; %Flow16 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB0_6: ; %udiv-end1 +; GISEL-NEXT: s_mov_b64 s[12:13], exec ; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: v_ashrrev_i32_e32 v18, 31, v7 ; GISEL-NEXT: v_ashrrev_i32_e32 v19, 31, v15 -; GISEL-NEXT: v_mov_b32_e32 v10, 0x7f -; GISEL-NEXT: v_mov_b32_e32 v11, 0 +; GISEL-NEXT: v_mov_b32_e32 v16, 0x7f +; GISEL-NEXT: v_mov_b32_e32 v17, 0 ; GISEL-NEXT: v_xor_b32_e32 v0, v18, v4 ; GISEL-NEXT: v_xor_b32_e32 v1, v18, v5 ; GISEL-NEXT: v_xor_b32_e32 v2, v18, v6 ; GISEL-NEXT: v_xor_b32_e32 v3, v18, v7 ; GISEL-NEXT: v_xor_b32_e32 v4, v19, v12 ; GISEL-NEXT: v_xor_b32_e32 v5, v19, v13 -; GISEL-NEXT: v_xor_b32_e32 v14, v19, v14 -; GISEL-NEXT: v_xor_b32_e32 v15, v19, v15 +; GISEL-NEXT: v_xor_b32_e32 v12, v19, v14 +; GISEL-NEXT: v_xor_b32_e32 v13, v19, v15 ; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v0, v18 ; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v1, v18, vcc ; GISEL-NEXT: v_sub_i32_e64 v22, s[4:5], v4, v19 ; GISEL-NEXT: v_subb_u32_e64 v23, s[4:5], v5, v19, s[4:5] -; GISEL-NEXT: 
v_subb_u32_e32 v12, vcc, v2, v18, vcc -; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v3, v18, vcc -; GISEL-NEXT: v_subb_u32_e64 v4, vcc, v14, v19, s[4:5] -; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v15, v19, vcc -; GISEL-NEXT: v_ffbh_u32_e32 v14, v23 -; GISEL-NEXT: v_ffbh_u32_e32 v15, v22 -; GISEL-NEXT: v_ffbh_u32_e32 v16, v7 -; GISEL-NEXT: v_ffbh_u32_e32 v17, v6 +; GISEL-NEXT: v_subb_u32_e32 v10, vcc, v2, v18, vcc +; GISEL-NEXT: v_subb_u32_e32 v11, vcc, v3, v18, vcc +; GISEL-NEXT: v_subb_u32_e64 v4, vcc, v12, v19, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v13, v19, vcc +; GISEL-NEXT: v_ffbh_u32_e32 v12, v23 +; GISEL-NEXT: v_ffbh_u32_e32 v13, v22 +; GISEL-NEXT: v_ffbh_u32_e32 v14, v7 +; GISEL-NEXT: v_ffbh_u32_e32 v15, v6 ; GISEL-NEXT: v_or_b32_e32 v0, v22, v4 ; GISEL-NEXT: v_or_b32_e32 v1, v23, v5 -; GISEL-NEXT: v_or_b32_e32 v2, v6, v12 -; GISEL-NEXT: v_or_b32_e32 v3, v7, v13 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, 32, v15 +; GISEL-NEXT: v_or_b32_e32 v2, v6, v10 +; GISEL-NEXT: v_or_b32_e32 v3, v7, v11 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, 32, v13 ; GISEL-NEXT: v_ffbh_u32_e32 v26, v5 ; GISEL-NEXT: v_ffbh_u32_e32 v27, v4 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, 32, v17 -; GISEL-NEXT: v_ffbh_u32_e32 v28, v13 -; GISEL-NEXT: v_ffbh_u32_e32 v29, v12 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, 32, v15 +; GISEL-NEXT: v_ffbh_u32_e32 v28, v11 +; GISEL-NEXT: v_ffbh_u32_e32 v29, v10 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3] -; GISEL-NEXT: v_min_u32_e32 v0, v14, v15 +; GISEL-NEXT: v_min_u32_e32 v0, v12, v13 ; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], 32, v27 -; GISEL-NEXT: v_min_u32_e32 v2, v16, v17 +; GISEL-NEXT: v_min_u32_e32 v2, v14, v15 ; GISEL-NEXT: v_add_i32_e64 v3, s[6:7], 32, v29 ; GISEL-NEXT: v_add_i32_e64 v0, s[6:7], 64, v0 ; GISEL-NEXT: v_min_u32_e32 v1, v26, v1 @@ -663,36 +674,37 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u64_e32 
vcc, 0, v[4:5] ; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[12:13] +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] ; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_subb_u32_e64 v2, s[4:5], 0, 0, s[4:5] ; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[10:11] +; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[16:17] ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_xor_b32_e32 v10, 0x7f, v0 +; GISEL-NEXT: v_xor_b32_e32 v12, 0x7f, v0 ; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[2:3] ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_or_b32_e32 v10, v10, v2 -; GISEL-NEXT: v_or_b32_e32 v11, v1, v3 +; GISEL-NEXT: v_or_b32_e32 v12, v12, v2 +; GISEL-NEXT: v_or_b32_e32 v13, v1, v3 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GISEL-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_or_b32_e32 v11, v14, v15 -; GISEL-NEXT: v_and_b32_e32 v14, 1, v11 -; GISEL-NEXT: v_or_b32_e32 v10, v11, v10 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[12:13] +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v13, v14, v15 +; GISEL-NEXT: v_and_b32_e32 v14, 1, v13 +; GISEL-NEXT: v_or_b32_e32 v12, v13, v12 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, v6, 0, vcc -; GISEL-NEXT: v_and_b32_e32 v16, 1, v10 +; GISEL-NEXT: v_and_b32_e32 v13, 1, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v15, v7, 0, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v10, v12, 0, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v11, v13, 0, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB0_12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, v10, 0, vcc +; 
GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v13 +; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GISEL-NEXT: v_cndmask_b32_e64 v13, v11, 0, vcc +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB0_12 ; GISEL-NEXT: ; %bb.7: ; %udiv-bb1 ; GISEL-NEXT: v_add_i32_e32 v26, vcc, 1, v0 ; GISEL-NEXT: v_addc_u32_e64 v27, s[4:5], 0, v1, vcc @@ -700,53 +712,55 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_addc_u32_e64 v28, vcc, 0, v2, s[4:5] ; GISEL-NEXT: v_addc_u32_e32 v29, vcc, 0, v3, vcc ; GISEL-NEXT: v_subrev_i32_e64 v14, s[4:5], 64, v30 -; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], 64, v30 +; GISEL-NEXT: v_sub_i32_e64 v12, s[4:5], 64, v30 ; GISEL-NEXT: v_lshl_b64 v[0:1], v[6:7], v30 -; GISEL-NEXT: v_lshl_b64 v[2:3], v[12:13], v30 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[10:11], v30 ; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GISEL-NEXT: v_lshr_b64 v[10:11], v[6:7], v10 +; GISEL-NEXT: v_lshr_b64 v[12:13], v[6:7], v12 ; GISEL-NEXT: v_lshl_b64 v[16:17], v[6:7], v14 ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v30 ; GISEL-NEXT: v_cndmask_b32_e32 v14, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v15, 0, v1, vcc -; GISEL-NEXT: v_or_b32_e32 v0, v10, v2 -; GISEL-NEXT: v_or_b32_e32 v1, v11, v3 +; GISEL-NEXT: v_or_b32_e32 v0, v12, v2 +; GISEL-NEXT: v_or_b32_e32 v1, v13, v3 +; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GISEL-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc +; GISEL-NEXT: s_xor_b64 s[14:15], s[4:5], exec ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 -; GISEL-NEXT: v_cndmask_b32_e32 v10, v0, v12, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v11, v1, v13, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v12, v0, v10, vcc +; GISEL-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v1, v11, vcc ; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] ; GISEL-NEXT: v_mov_b32_e32 v0, s8 ; GISEL-NEXT: v_mov_b32_e32 v1, s9 ; GISEL-NEXT: v_mov_b32_e32 v2, s10 ; 
GISEL-NEXT: v_mov_b32_e32 v3, s11 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] -; GISEL-NEXT: s_cbranch_execz .LBB0_11 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB0_11 ; GISEL-NEXT: ; %bb.8: ; %udiv-preheader ; GISEL-NEXT: v_subrev_i32_e32 v32, vcc, 64, v26 ; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 64, v26 -; GISEL-NEXT: v_lshr_b64 v[0:1], v[12:13], v26 +; GISEL-NEXT: v_lshr_b64 v[0:1], v[10:11], v26 ; GISEL-NEXT: v_lshr_b64 v[2:3], v[6:7], v26 ; GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GISEL-NEXT: v_add_i32_e32 v30, vcc, -1, v22 ; GISEL-NEXT: v_addc_u32_e32 v31, vcc, -1, v23, vcc -; GISEL-NEXT: v_lshl_b64 v[16:17], v[12:13], v16 -; GISEL-NEXT: v_lshr_b64 v[12:13], v[12:13], v32 +; GISEL-NEXT: v_lshl_b64 v[16:17], v[10:11], v16 +; GISEL-NEXT: v_lshr_b64 v[10:11], v[10:11], v32 ; GISEL-NEXT: v_addc_u32_e32 v32, vcc, -1, v4, vcc ; GISEL-NEXT: v_addc_u32_e32 v33, vcc, -1, v5, vcc ; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] ; GISEL-NEXT: v_or_b32_e32 v2, v2, v16 ; GISEL-NEXT: v_or_b32_e32 v3, v3, v17 ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v26 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v16, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v17, 0, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26 -; GISEL-NEXT: v_cndmask_b32_e32 v12, v2, v6, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v13, v3, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v10, v2, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v11, v3, v7, vcc ; GISEL-NEXT: v_mov_b32_e32 v7, 0 ; GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GISEL-NEXT: v_mov_b32_e32 v1, s5 @@ -754,20 +768,20 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_mov_b32_e32 v3, s7 ; GISEL-NEXT: .LBB0_9: ; %udiv-do-while ; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 -; GISEL-NEXT: 
v_lshl_b64 v[2:3], v[12:13], 1 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[10:11], 1 ; GISEL-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v6, 31, v13 -; GISEL-NEXT: v_lshrrev_b32_e32 v34, 31, v11 -; GISEL-NEXT: v_lshl_b64 v[12:13], v[14:15], 1 -; GISEL-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v6, 31, v11 +; GISEL-NEXT: v_lshrrev_b32_e32 v34, 31, v13 +; GISEL-NEXT: v_lshl_b64 v[10:11], v[14:15], 1 +; GISEL-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 ; GISEL-NEXT: v_lshrrev_b32_e32 v14, 31, v15 ; GISEL-NEXT: v_add_i32_e32 v26, vcc, -1, v26 ; GISEL-NEXT: v_addc_u32_e32 v27, vcc, -1, v27, vcc ; GISEL-NEXT: v_or_b32_e32 v16, v16, v6 ; GISEL-NEXT: v_or_b32_e32 v2, v2, v34 -; GISEL-NEXT: v_or_b32_e32 v10, v10, v14 -; GISEL-NEXT: v_or_b32_e32 v14, v0, v12 -; GISEL-NEXT: v_or_b32_e32 v15, v1, v13 +; GISEL-NEXT: v_or_b32_e32 v12, v12, v14 +; GISEL-NEXT: v_or_b32_e32 v14, v0, v10 +; GISEL-NEXT: v_or_b32_e32 v15, v1, v11 ; GISEL-NEXT: v_addc_u32_e32 v28, vcc, -1, v28, vcc ; GISEL-NEXT: v_addc_u32_e32 v29, vcc, -1, v29, vcc ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v30, v2 @@ -780,30 +794,30 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v6 ; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GISEL-NEXT: v_and_b32_e32 v6, 1, v0 -; GISEL-NEXT: v_and_b32_e32 v12, v0, v22 -; GISEL-NEXT: v_and_b32_e32 v13, v0, v23 +; GISEL-NEXT: v_and_b32_e32 v10, v0, v22 +; GISEL-NEXT: v_and_b32_e32 v11, v0, v23 ; GISEL-NEXT: v_and_b32_e32 v34, v0, v4 ; GISEL-NEXT: v_and_b32_e32 v35, v0, v5 +; GISEL-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GISEL-NEXT: v_mov_b32_e32 v0, v6 ; GISEL-NEXT: v_mov_b32_e32 v1, v7 -; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v2, v12 -; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v3, v13, vcc +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v2, v10 +; GISEL-NEXT: v_subb_u32_e32 v11, vcc, v3, v11, vcc ; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v16, v34, vcc ; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v17, 
v35, vcc -; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GISEL-NEXT: s_cbranch_execnz .LBB0_9 +; GISEL-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GISEL-NEXT: s_cbranch_scc1 .LBB0_9 ; GISEL-NEXT: ; %bb.10: ; %Flow -; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: s_or_b64 exec, exec, s[14:15] ; GISEL-NEXT: .LBB0_11: ; %Flow11 -; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] ; GISEL-NEXT: v_lshl_b64 v[2:3], v[14:15], 1 -; GISEL-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; GISEL-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 ; GISEL-NEXT: v_lshrrev_b32_e32 v4, 31, v15 -; GISEL-NEXT: v_or_b32_e32 v10, v10, v4 +; GISEL-NEXT: v_or_b32_e32 v12, v12, v4 ; GISEL-NEXT: v_or_b32_e32 v14, v0, v2 ; GISEL-NEXT: v_or_b32_e32 v15, v1, v3 -; GISEL-NEXT: .LBB0_12: ; %Flow12 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB0_12: ; %udiv-end ; GISEL-NEXT: v_xor_b32_e32 v3, v25, v24 ; GISEL-NEXT: v_xor_b32_e32 v7, v19, v18 ; GISEL-NEXT: v_xor_b32_e32 v0, v20, v3 @@ -812,8 +826,8 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_xor_b32_e32 v6, v9, v3 ; GISEL-NEXT: v_xor_b32_e32 v4, v14, v7 ; GISEL-NEXT: v_xor_b32_e32 v5, v15, v7 -; GISEL-NEXT: v_xor_b32_e32 v8, v10, v7 -; GISEL-NEXT: v_xor_b32_e32 v9, v11, v7 +; GISEL-NEXT: v_xor_b32_e32 v8, v12, v7 +; GISEL-NEXT: v_xor_b32_e32 v9, v13, v7 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v4, v7 @@ -831,6 +845,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-LABEL: v_udiv_v2i128_vv: ; SDAG: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b64 s[8:9], exec ; SDAG-NEXT: v_or_b32_e32 v17, v9, v11 ; SDAG-NEXT: v_or_b32_e32 v16, v8, v10 ; SDAG-NEXT: v_or_b32_e32 v19, v1, v3 @@ -844,7 +859,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_ffbh_u32_e32 v26, v0 ; 
SDAG-NEXT: v_ffbh_u32_e32 v27, v1 ; SDAG-NEXT: v_mov_b32_e32 v28, 0 -; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f +; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] ; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19] ; SDAG-NEXT: v_add_i32_e64 v16, s[6:7], 32, v20 @@ -866,18 +881,18 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; SDAG-NEXT: v_cndmask_b32_e64 v17, v21, 0, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc -; SDAG-NEXT: v_sub_i32_e32 v23, vcc, v16, v18 -; SDAG-NEXT: v_subb_u32_e32 v24, vcc, v20, v17, vcc -; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v23 -; SDAG-NEXT: v_subbrev_u32_e32 v25, vcc, 0, v28, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[23:24] +; SDAG-NEXT: v_sub_i32_e32 v21, vcc, v16, v18 +; SDAG-NEXT: v_subb_u32_e32 v22, vcc, v20, v17, vcc +; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v21 +; SDAG-NEXT: v_subbrev_u32_e32 v23, vcc, 0, v28, vcc +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[21:22] ; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; SDAG-NEXT: v_subbrev_u32_e32 v26, vcc, 0, v28, vcc -; SDAG-NEXT: v_or_b32_e32 v16, v16, v25 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[25:26] +; SDAG-NEXT: v_subbrev_u32_e32 v24, vcc, 0, v28, vcc +; SDAG-NEXT: v_or_b32_e32 v16, v16, v23 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[23:24] ; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v17, v24, v26 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[25:26] +; SDAG-NEXT: v_or_b32_e32 v17, v22, v24 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[23:24] ; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17] ; SDAG-NEXT: v_and_b32_e32 v16, 1, v18 @@ -887,44 +902,46 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; SDAG-NEXT: v_cndmask_b32_e64 v17, v2, 0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v18, v1, 0, s[4:5] +; SDAG-NEXT: s_and_b64 s[6:7], 
s[6:7], vcc +; SDAG-NEXT: s_and_b64 s[6:7], s[6:7], exec ; SDAG-NEXT: v_cndmask_b32_e64 v19, v0, 0, s[4:5] -; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; SDAG-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB1_6 +; SDAG-NEXT: s_cmov_b64 exec, s[6:7] +; SDAG-NEXT: s_cbranch_scc0 .LBB1_6 ; SDAG-NEXT: ; %bb.1: ; %udiv-bb15 -; SDAG-NEXT: v_add_i32_e32 v18, vcc, 1, v23 -; SDAG-NEXT: v_sub_i32_e64 v16, s[4:5], 63, v23 -; SDAG-NEXT: v_mov_b32_e32 v21, 0 -; SDAG-NEXT: v_mov_b32_e32 v22, 0 -; SDAG-NEXT: v_addc_u32_e32 v27, vcc, 0, v24, vcc +; SDAG-NEXT: v_add_i32_e32 v18, vcc, 1, v21 +; SDAG-NEXT: v_sub_i32_e64 v16, s[4:5], 63, v21 +; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_addc_u32_e32 v27, vcc, 0, v22, vcc ; SDAG-NEXT: v_lshl_b64 v[16:17], v[0:1], v16 -; SDAG-NEXT: v_addc_u32_e32 v28, vcc, 0, v25, vcc -; SDAG-NEXT: v_addc_u32_e32 v29, vcc, 0, v26, vcc -; SDAG-NEXT: v_or_b32_e32 v19, v18, v28 -; SDAG-NEXT: v_sub_i32_e32 v30, vcc, 0x7f, v23 -; SDAG-NEXT: v_or_b32_e32 v20, v27, v29 -; SDAG-NEXT: v_lshl_b64 v[23:24], v[2:3], v30 -; SDAG-NEXT: v_sub_i32_e32 v31, vcc, 64, v30 -; SDAG-NEXT: v_lshl_b64 v[25:26], v[0:1], v30 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[19:20] -; SDAG-NEXT: v_lshr_b64 v[19:20], v[0:1], v31 -; SDAG-NEXT: v_or_b32_e32 v20, v24, v20 -; SDAG-NEXT: v_or_b32_e32 v19, v23, v19 -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30 -; SDAG-NEXT: v_cndmask_b32_e64 v17, v17, v20, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v16, v16, v19, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v24, 0, v26, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, v25, s[4:5] -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v30 +; SDAG-NEXT: v_addc_u32_e32 v28, vcc, 0, v23, vcc +; SDAG-NEXT: v_addc_u32_e32 v29, vcc, 0, v24, vcc +; SDAG-NEXT: v_or_b32_e32 v22, v18, v28 +; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v21 +; SDAG-NEXT: v_or_b32_e32 v23, v27, v29 +; SDAG-NEXT: v_lshl_b64 v[24:25], v[2:3], v26 +; SDAG-NEXT: v_sub_i32_e32 v21, vcc, 64, 
v26 +; SDAG-NEXT: v_lshl_b64 v[30:31], v[0:1], v26 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[22:23] +; SDAG-NEXT: v_lshr_b64 v[21:22], v[0:1], v21 +; SDAG-NEXT: v_or_b32_e32 v22, v25, v22 +; SDAG-NEXT: v_or_b32_e32 v21, v24, v21 +; SDAG-NEXT: s_xor_b64 s[10:11], vcc, exec +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v26 +; SDAG-NEXT: v_cndmask_b32_e64 v17, v17, v22, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v16, v21, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v24, 0, v31, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, v30, s[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], vcc, -1 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v26 ; SDAG-NEXT: v_cndmask_b32_e64 v17, v17, v3, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v16, v16, v2, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v19, 0 -; SDAG-NEXT: v_mov_b32_e32 v20, 0 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB1_5 +; SDAG-NEXT: v_mov_b32_e32 v21, 0 +; SDAG-NEXT: v_mov_b32_e32 v22, 0 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB1_5 ; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4 -; SDAG-NEXT: v_lshr_b64 v[21:22], v[0:1], v18 +; SDAG-NEXT: v_lshr_b64 v[19:20], v[0:1], v18 ; SDAG-NEXT: v_sub_i32_e32 v31, vcc, 64, v18 ; SDAG-NEXT: v_subrev_i32_e32 v36, vcc, 64, v18 ; SDAG-NEXT: v_lshr_b64 v[32:33], v[2:3], v18 @@ -932,8 +949,8 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_mov_b64 s[12:13], 0 ; SDAG-NEXT: v_mov_b32_e32 v25, 0 ; SDAG-NEXT: v_mov_b32_e32 v26, 0 -; SDAG-NEXT: v_mov_b32_e32 v19, 0 -; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_mov_b32_e32 v21, 0 +; SDAG-NEXT: v_mov_b32_e32 v22, 0 ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v18 ; SDAG-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v18 ; SDAG-NEXT: v_lshl_b64 v[34:35], v[2:3], v31 @@ -941,18 +958,18 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v9, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v33, 
s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v32, s[4:5] -; SDAG-NEXT: v_or_b32_e32 v22, v22, v35 -; SDAG-NEXT: v_or_b32_e32 v21, v21, v34 +; SDAG-NEXT: v_or_b32_e32 v20, v20, v35 +; SDAG-NEXT: v_or_b32_e32 v19, v19, v34 ; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v10, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v22, v37, v22, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v21, v36, v21, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v20, v37, v20, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v19, v36, v19, s[4:5] ; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v11, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v1, v22, v1, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v0, v21, v0, s[6:7] -; SDAG-NEXT: v_mov_b32_e32 v22, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v1, v20, v1, s[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v0, v19, v0, s[6:7] +; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: .LBB1_3: ; %udiv-do-while3 ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 -; SDAG-NEXT: v_lshrrev_b32_e32 v21, 31, v24 +; SDAG-NEXT: v_lshrrev_b32_e32 v19, 31, v24 ; SDAG-NEXT: v_lshl_b64 v[23:24], v[23:24], 1 ; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v34, 31, v1 @@ -963,17 +980,17 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v23, v25, v23 ; SDAG-NEXT: v_or_b32_e32 v2, v2, v34 ; SDAG-NEXT: v_or_b32_e32 v0, v0, v35 -; SDAG-NEXT: v_or_b32_e32 v16, v16, v21 -; SDAG-NEXT: v_sub_i32_e32 v21, vcc, v30, v0 -; SDAG-NEXT: v_subb_u32_e32 v21, vcc, v31, v1, vcc -; SDAG-NEXT: v_subb_u32_e32 v21, vcc, v32, v2, vcc -; SDAG-NEXT: v_subb_u32_e32 v21, vcc, v33, v3, vcc -; SDAG-NEXT: v_ashrrev_i32_e32 v21, 31, v21 -; SDAG-NEXT: v_and_b32_e32 v25, v21, v8 -; SDAG-NEXT: v_and_b32_e32 v26, v21, v9 -; SDAG-NEXT: v_and_b32_e32 v34, v21, v10 -; SDAG-NEXT: v_and_b32_e32 v35, v21, v11 -; SDAG-NEXT: v_and_b32_e32 v21, 1, v21 +; SDAG-NEXT: v_or_b32_e32 v16, v16, v19 +; SDAG-NEXT: v_sub_i32_e32 v19, vcc, v30, v0 +; SDAG-NEXT: v_subb_u32_e32 v19, vcc, v31, v1, vcc +; SDAG-NEXT: v_subb_u32_e32 
v19, vcc, v32, v2, vcc +; SDAG-NEXT: v_subb_u32_e32 v19, vcc, v33, v3, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v19, 31, v19 +; SDAG-NEXT: v_and_b32_e32 v25, v19, v8 +; SDAG-NEXT: v_and_b32_e32 v26, v19, v9 +; SDAG-NEXT: v_and_b32_e32 v34, v19, v10 +; SDAG-NEXT: v_and_b32_e32 v35, v19, v11 +; SDAG-NEXT: v_and_b32_e32 v19, 1, v19 ; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v25 ; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v26, vcc ; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v2, v34, vcc @@ -985,27 +1002,28 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v25, v18, v28 ; SDAG-NEXT: v_or_b32_e32 v26, v27, v29 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[25:26] -; SDAG-NEXT: v_or_b32_e32 v17, v20, v17 +; SDAG-NEXT: v_or_b32_e32 v17, v22, v17 ; SDAG-NEXT: s_or_b64 s[12:13], vcc, s[12:13] -; SDAG-NEXT: v_or_b32_e32 v16, v19, v16 -; SDAG-NEXT: v_mov_b32_e32 v26, v22 -; SDAG-NEXT: v_mov_b32_e32 v25, v21 -; SDAG-NEXT: s_andn2_b64 exec, exec, s[12:13] -; SDAG-NEXT: s_cbranch_execnz .LBB1_3 +; SDAG-NEXT: s_andn2_b64 s[4:5], exec, s[12:13] +; SDAG-NEXT: v_or_b32_e32 v16, v21, v16 +; SDAG-NEXT: v_mov_b32_e32 v26, v20 +; SDAG-NEXT: v_mov_b32_e32 v25, v19 +; SDAG-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; SDAG-NEXT: s_cbranch_scc1 .LBB1_3 ; SDAG-NEXT: ; %bb.4: ; %Flow13 -; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] -; SDAG-NEXT: .LBB1_5: ; %Flow14 ; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] +; SDAG-NEXT: .LBB1_5: ; %Flow14 ; SDAG-NEXT: v_lshl_b64 v[0:1], v[16:17], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v24 ; SDAG-NEXT: v_lshl_b64 v[2:3], v[23:24], 1 ; SDAG-NEXT: v_or_b32_e32 v0, v0, v8 -; SDAG-NEXT: v_or_b32_e32 v16, v20, v1 -; SDAG-NEXT: v_or_b32_e32 v18, v22, v3 -; SDAG-NEXT: v_or_b32_e32 v17, v19, v0 -; SDAG-NEXT: v_or_b32_e32 v19, v21, v2 -; SDAG-NEXT: .LBB1_6: ; %Flow16 +; SDAG-NEXT: v_or_b32_e32 v16, v22, v1 +; SDAG-NEXT: v_or_b32_e32 v18, v20, v3 +; SDAG-NEXT: v_or_b32_e32 v17, v21, v0 +; SDAG-NEXT: v_or_b32_e32 v19, v19, v2 ; SDAG-NEXT: 
s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: .LBB1_6: ; %udiv-end1 +; SDAG-NEXT: s_mov_b64 s[8:9], exec ; SDAG-NEXT: v_or_b32_e32 v1, v13, v15 ; SDAG-NEXT: v_or_b32_e32 v0, v12, v14 ; SDAG-NEXT: v_or_b32_e32 v3, v5, v7 @@ -1019,7 +1037,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_ffbh_u32_e32 v22, v4 ; SDAG-NEXT: v_ffbh_u32_e32 v23, v5 ; SDAG-NEXT: v_mov_b32_e32 v24, 0 -; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f +; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3] ; SDAG-NEXT: v_add_i32_e64 v0, s[6:7], 32, v8 @@ -1045,7 +1063,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v8, v1, vcc ; SDAG-NEXT: v_xor_b32_e32 v8, 0x7f, v0 ; SDAG-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v24, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[0:1] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[0:1] ; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] ; SDAG-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v24, vcc ; SDAG-NEXT: v_or_b32_e32 v8, v8, v2 @@ -1062,10 +1080,11 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; SDAG-NEXT: v_cndmask_b32_e64 v9, v6, 0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v10, v5, 0, s[4:5] +; SDAG-NEXT: s_and_b64 s[6:7], s[6:7], vcc +; SDAG-NEXT: s_and_b64 s[6:7], s[6:7], exec ; SDAG-NEXT: v_cndmask_b32_e64 v11, v4, 0, s[4:5] -; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB1_12 +; SDAG-NEXT: s_cmov_b64 exec, s[6:7] +; SDAG-NEXT: s_cbranch_scc0 .LBB1_12 ; SDAG-NEXT: ; %bb.7: ; %udiv-bb1 ; SDAG-NEXT: v_add_i32_e32 v8, vcc, 1, v0 ; SDAG-NEXT: v_sub_i32_e64 v9, s[4:5], 63, v0 @@ -1085,19 +1104,20 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_lshr_b64 v[0:1], v[4:5], v0 ; SDAG-NEXT: v_or_b32_e32 v1, v23, v1 ; 
SDAG-NEXT: v_or_b32_e32 v0, v22, v0 +; SDAG-NEXT: s_xor_b64 s[6:7], vcc, exec ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v3 ; SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v1, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, v0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v27, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v26, s[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], vcc, -1 ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 ; SDAG-NEXT: v_cndmask_b32_e64 v3, v2, v7, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v2, v9, v6, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v9, 0 ; SDAG-NEXT: v_mov_b32_e32 v10, 0 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB1_11 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB1_11 ; SDAG-NEXT: ; %bb.8: ; %udiv-preheader ; SDAG-NEXT: v_lshr_b64 v[20:21], v[4:5], v8 ; SDAG-NEXT: v_sub_i32_e32 v27, vcc, 64, v8 @@ -1162,15 +1182,15 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v30, v8, v24 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[30:31] ; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; SDAG-NEXT: s_andn2_b64 s[4:5], exec, s[10:11] ; SDAG-NEXT: v_or_b32_e32 v0, v22, v0 ; SDAG-NEXT: v_mov_b32_e32 v23, v21 ; SDAG-NEXT: v_mov_b32_e32 v22, v20 -; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] -; SDAG-NEXT: s_cbranch_execnz .LBB1_9 +; SDAG-NEXT: s_cselect_b64 exec, s[4:5], s[10:11] +; SDAG-NEXT: s_cbranch_scc1 .LBB1_9 ; SDAG-NEXT: ; %bb.10: ; %Flow -; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: .LBB1_11: ; %Flow11 -; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] ; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v4, 31, v1 ; SDAG-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 @@ -1179,8 +1199,8 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v10, v21, v1 ; SDAG-NEXT: v_or_b32_e32 v9, v9, v2 ; SDAG-NEXT: v_or_b32_e32 v11, v20, v0 -; 
SDAG-NEXT: .LBB1_12: ; %Flow12 -; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: .LBB1_12: ; %udiv-end ; SDAG-NEXT: v_mov_b32_e32 v0, v19 ; SDAG-NEXT: v_mov_b32_e32 v1, v18 ; SDAG-NEXT: v_mov_b32_e32 v2, v17 @@ -1196,6 +1216,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v16, v2 ; GISEL-NEXT: v_mov_b32_e32 v17, v3 +; GISEL-NEXT: s_mov_b64 s[12:13], exec ; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: v_or_b32_e32 v2, v8, v10 ; GISEL-NEXT: v_or_b32_e32 v3, v9, v11 @@ -1249,14 +1270,15 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_or_b32_e32 v2, v3, v2 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, v0, 0, vcc -; GISEL-NEXT: v_and_b32_e32 v24, 1, v2 +; GISEL-NEXT: v_and_b32_e32 v3, 1, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v19, v1, 0, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v2, v16, 0, vcc +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v3 +; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GISEL-NEXT: v_cndmask_b32_e64 v3, v17, 0, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB1_6 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB1_6 ; GISEL-NEXT: ; %bb.1: ; %udiv-bb15 ; GISEL-NEXT: v_add_i32_e32 v26, vcc, 1, v20 ; GISEL-NEXT: v_addc_u32_e64 v27, s[4:5], 0, v21, vcc @@ -1275,19 +1297,21 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cndmask_b32_e32 v23, 0, v3, vcc ; GISEL-NEXT: v_or_b32_e32 v2, v20, v18 ; GISEL-NEXT: v_or_b32_e32 v3, v21, v19 +; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GISEL-NEXT: v_cndmask_b32_e32 v2, v24, v2, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v25, v3, vcc +; GISEL-NEXT: s_xor_b64 
s[14:15], s[4:5], exec ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc +; GISEL-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v17, vcc ; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] ; GISEL-NEXT: v_mov_b32_e32 v21, s11 ; GISEL-NEXT: v_mov_b32_e32 v20, s10 ; GISEL-NEXT: v_mov_b32_e32 v19, s9 ; GISEL-NEXT: v_mov_b32_e32 v18, s8 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] -; GISEL-NEXT: s_cbranch_execz .LBB1_5 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB1_5 ; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4 ; GISEL-NEXT: v_subrev_i32_e32 v32, vcc, 64, v26 ; GISEL-NEXT: v_sub_i32_e32 v24, vcc, 64, v26 @@ -1347,6 +1371,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_and_b32_e32 v21, v0, v10 ; GISEL-NEXT: v_and_b32_e32 v35, v0, v11 ; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 +; GISEL-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GISEL-NEXT: v_sub_i32_e32 v24, vcc, v20, v18 ; GISEL-NEXT: v_subb_u32_e32 v25, vcc, v25, v19, vcc ; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v16, v21, vcc @@ -1354,20 +1379,20 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_or_b32_e32 v2, v2, v34 ; GISEL-NEXT: v_mov_b32_e32 v19, v1 ; GISEL-NEXT: v_mov_b32_e32 v18, v0 -; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GISEL-NEXT: s_cbranch_execnz .LBB1_3 +; GISEL-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GISEL-NEXT: s_cbranch_scc1 .LBB1_3 ; GISEL-NEXT: ; %bb.4: ; %Flow13 -; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: s_or_b64 exec, exec, s[14:15] ; GISEL-NEXT: .LBB1_5: ; %Flow14 -; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] ; GISEL-NEXT: v_lshl_b64 v[0:1], v[22:23], 1 ; GISEL-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GISEL-NEXT: v_lshrrev_b32_e32 v8, 31, v23 ; GISEL-NEXT: v_or_b32_e32 v2, v2, v8 ; GISEL-NEXT: v_or_b32_e32 v18, v18, v0 ; GISEL-NEXT: v_or_b32_e32 v19, v19, v1 -; 
GISEL-NEXT: .LBB1_6: ; %Flow16 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB1_6: ; %udiv-end1 +; GISEL-NEXT: s_mov_b64 s[12:13], exec ; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: v_or_b32_e32 v0, v12, v14 ; GISEL-NEXT: v_or_b32_e32 v1, v13, v15 @@ -1421,14 +1446,15 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_or_b32_e32 v8, v9, v8 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, v4, 0, vcc -; GISEL-NEXT: v_and_b32_e32 v20, 1, v8 +; GISEL-NEXT: v_and_b32_e32 v9, 1, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v11, v5, 0, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v8, v6, 0, vcc +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v9 +; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GISEL-NEXT: v_cndmask_b32_e64 v9, v7, 0, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB1_12 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB1_12 ; GISEL-NEXT: ; %bb.7: ; %udiv-bb1 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v0 ; GISEL-NEXT: v_addc_u32_e64 v11, s[4:5], 0, v1, vcc @@ -1447,19 +1473,21 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v1, vcc ; GISEL-NEXT: v_or_b32_e32 v0, v20, v16 ; GISEL-NEXT: v_or_b32_e32 v1, v21, v17 +; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GISEL-NEXT: v_cndmask_b32_e32 v0, v22, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, v23, v1, vcc +; GISEL-NEXT: s_xor_b64 s[14:15], s[4:5], exec ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GISEL-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc ; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] ; GISEL-NEXT: v_mov_b32_e32 v23, s11 ; GISEL-NEXT: v_mov_b32_e32 v22, s10 ; GISEL-NEXT: v_mov_b32_e32 v21, s9 ; GISEL-NEXT: 
v_mov_b32_e32 v20, s8 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] -; GISEL-NEXT: s_cbranch_execz .LBB1_11 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB1_11 ; GISEL-NEXT: ; %bb.8: ; %udiv-preheader ; GISEL-NEXT: v_subrev_i32_e32 v28, vcc, 64, v8 ; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v8 @@ -1520,26 +1548,26 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_and_b32_e32 v30, v6, v13 ; GISEL-NEXT: v_and_b32_e32 v31, v6, v14 ; GISEL-NEXT: v_and_b32_e32 v32, v6, v15 +; GISEL-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GISEL-NEXT: v_mov_b32_e32 v21, v5 ; GISEL-NEXT: v_mov_b32_e32 v20, v4 ; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v22, v7 ; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v23, v30, vcc ; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v16, v31, vcc ; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v17, v32, vcc -; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GISEL-NEXT: s_cbranch_execnz .LBB1_9 +; GISEL-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GISEL-NEXT: s_cbranch_scc1 .LBB1_9 ; GISEL-NEXT: ; %bb.10: ; %Flow -; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: s_or_b64 exec, exec, s[14:15] ; GISEL-NEXT: .LBB1_11: ; %Flow11 -; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] ; GISEL-NEXT: v_lshl_b64 v[4:5], v[9:10], 1 ; GISEL-NEXT: v_lshl_b64 v[8:9], v[0:1], 1 ; GISEL-NEXT: v_lshrrev_b32_e32 v0, 31, v10 ; GISEL-NEXT: v_or_b32_e32 v8, v8, v0 ; GISEL-NEXT: v_or_b32_e32 v10, v20, v4 ; GISEL-NEXT: v_or_b32_e32 v11, v21, v5 -; GISEL-NEXT: .LBB1_12: ; %Flow12 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB1_12: ; %udiv-end ; GISEL-NEXT: v_mov_b32_e32 v0, v18 ; GISEL-NEXT: v_mov_b32_e32 v1, v19 ; GISEL-NEXT: v_mov_b32_e32 v4, v10 @@ -1556,10 +1584,11 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: buffer_store_dword v40, 
off, s[0:3], s32 ; 4-byte Folded Spill +; SDAG-NEXT: s_mov_b64 s[10:11], exec ; SDAG-NEXT: v_ashrrev_i32_e32 v28, 31, v3 ; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 0, v0 ; SDAG-NEXT: v_mov_b32_e32 v19, 0 -; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f +; SDAG-NEXT: s_mov_b64 s[12:13], 0x7f ; SDAG-NEXT: v_mov_b32_e32 v29, v28 ; SDAG-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc ; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v2, vcc @@ -1615,7 +1644,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v10, v21, vcc ; SDAG-NEXT: v_xor_b32_e32 v10, 0x7f, v8 ; SDAG-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v19, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[8:9] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[12:13], v[8:9] ; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] ; SDAG-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v19, vcc ; SDAG-NEXT: v_or_b32_e32 v10, v10, v18 @@ -1632,10 +1661,11 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; SDAG-NEXT: v_cndmask_b32_e64 v32, v0, 0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v27, v17, 0, s[4:5] +; SDAG-NEXT: s_and_b64 s[6:7], s[6:7], vcc +; SDAG-NEXT: s_and_b64 s[6:7], s[6:7], exec ; SDAG-NEXT: v_cndmask_b32_e64 v33, v16, 0, s[4:5] -; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB2_6 +; SDAG-NEXT: s_cmov_b64 exec, s[6:7] +; SDAG-NEXT: s_cbranch_scc0 .LBB2_6 ; SDAG-NEXT: ; %bb.1: ; %udiv-bb15 ; SDAG-NEXT: v_add_i32_e32 v32, vcc, 1, v8 ; SDAG-NEXT: v_sub_i32_e64 v20, s[4:5], 63, v8 @@ -1655,26 +1685,27 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_lshr_b64 v[18:19], v[16:17], v25 ; SDAG-NEXT: v_or_b32_e32 v9, v9, v19 ; SDAG-NEXT: v_or_b32_e32 v8, v8, v18 +; SDAG-NEXT: s_xor_b64 s[6:7], vcc, exec ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24 ; SDAG-NEXT: v_cndmask_b32_e64 v9, v21, v9, s[4:5] ; SDAG-NEXT: 
v_cndmask_b32_e64 v8, v20, v8, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v23, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v22, s[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], vcc, -1 ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24 ; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, v1, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, v0, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v18, 0 ; SDAG-NEXT: v_mov_b32_e32 v19, 0 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB2_5 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB2_5 ; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4 ; SDAG-NEXT: v_lshr_b64 v[10:11], v[16:17], v32 ; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 64, v32 ; SDAG-NEXT: v_subrev_i32_e32 v37, vcc, 64, v32 ; SDAG-NEXT: v_lshr_b64 v[24:25], v[0:1], v32 ; SDAG-NEXT: v_add_i32_e32 v36, vcc, -1, v31 -; SDAG-NEXT: s_mov_b64 s[10:11], 0 +; SDAG-NEXT: s_mov_b64 s[8:9], 0 ; SDAG-NEXT: v_mov_b32_e32 v22, 0 ; SDAG-NEXT: v_mov_b32_e32 v23, 0 ; SDAG-NEXT: v_mov_b32_e32 v18, 0 @@ -1731,16 +1762,16 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v49, v33, v35 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[48:49] ; SDAG-NEXT: v_or_b32_e32 v21, v23, v21 -; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; SDAG-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SDAG-NEXT: s_andn2_b64 s[4:5], exec, s[8:9] ; SDAG-NEXT: v_or_b32_e32 v20, v22, v20 ; SDAG-NEXT: v_mov_b32_e32 v23, v11 ; SDAG-NEXT: v_mov_b32_e32 v22, v10 -; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] -; SDAG-NEXT: s_cbranch_execnz .LBB2_3 +; SDAG-NEXT: s_cselect_b64 exec, s[4:5], s[8:9] +; SDAG-NEXT: s_cbranch_scc1 .LBB2_3 ; SDAG-NEXT: ; %bb.4: ; %Flow13 -; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: .LBB2_5: ; %Flow14 -; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] ; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v22, 31, v21 ; SDAG-NEXT: v_lshl_b64 v[20:21], 
v[20:21], 1 @@ -1749,12 +1780,13 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v27, v11, v21 ; SDAG-NEXT: v_or_b32_e32 v32, v18, v8 ; SDAG-NEXT: v_or_b32_e32 v33, v10, v20 -; SDAG-NEXT: .LBB2_6: ; %Flow16 -; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] +; SDAG-NEXT: .LBB2_6: ; %udiv-end1 +; SDAG-NEXT: s_mov_b64 s[10:11], exec ; SDAG-NEXT: v_ashrrev_i32_e32 v26, 31, v7 ; SDAG-NEXT: v_sub_i32_e32 v8, vcc, 0, v4 ; SDAG-NEXT: v_mov_b32_e32 v18, 0 -; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f +; SDAG-NEXT: s_mov_b64 s[12:13], 0x7f ; SDAG-NEXT: v_mov_b32_e32 v34, v26 ; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v5, vcc ; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v6, vcc @@ -1810,7 +1842,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v13, v12, vcc ; SDAG-NEXT: v_xor_b32_e32 v14, 0x7f, v10 ; SDAG-NEXT: v_subbrev_u32_e32 v12, vcc, 0, v18, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[10:11] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[12:13], v[10:11] ; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] ; SDAG-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v18, vcc ; SDAG-NEXT: v_or_b32_e32 v14, v14, v12 @@ -1827,10 +1859,11 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; SDAG-NEXT: v_cndmask_b32_e64 v18, v4, 0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v15, v9, 0, s[4:5] +; SDAG-NEXT: s_and_b64 s[6:7], s[6:7], vcc +; SDAG-NEXT: s_and_b64 s[6:7], s[6:7], exec ; SDAG-NEXT: v_cndmask_b32_e64 v14, v8, 0, s[4:5] -; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB2_12 +; SDAG-NEXT: s_cmov_b64 exec, s[6:7] +; SDAG-NEXT: s_cbranch_scc0 .LBB2_12 ; SDAG-NEXT: ; %bb.7: ; %udiv-bb1 ; SDAG-NEXT: v_add_i32_e32 v38, vcc, 1, v10 ; SDAG-NEXT: v_sub_i32_e64 v18, s[4:5], 63, v10 @@ -1850,26 +1883,27 @@ define <2 x 
i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_lshr_b64 v[10:11], v[8:9], v10 ; SDAG-NEXT: v_or_b32_e32 v11, v21, v11 ; SDAG-NEXT: v_or_b32_e32 v10, v20, v10 +; SDAG-NEXT: s_xor_b64 s[6:7], vcc, exec ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v13 ; SDAG-NEXT: v_cndmask_b32_e64 v12, v19, v11, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v10, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v23, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v22, s[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], vcc, -1 ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v13 ; SDAG-NEXT: v_cndmask_b32_e64 v13, v12, v5, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v12, v18, v4, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v18, 0 ; SDAG-NEXT: v_mov_b32_e32 v19, 0 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB2_11 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB2_11 ; SDAG-NEXT: ; %bb.8: ; %udiv-preheader ; SDAG-NEXT: v_lshr_b64 v[14:15], v[8:9], v38 ; SDAG-NEXT: v_sub_i32_e32 v24, vcc, 64, v38 ; SDAG-NEXT: v_subrev_i32_e32 v51, vcc, 64, v38 ; SDAG-NEXT: v_lshr_b64 v[22:23], v[4:5], v38 ; SDAG-NEXT: v_add_i32_e32 v50, vcc, -1, v37 -; SDAG-NEXT: s_mov_b64 s[10:11], 0 +; SDAG-NEXT: s_mov_b64 s[8:9], 0 ; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: v_mov_b32_e32 v21, 0 ; SDAG-NEXT: v_mov_b32_e32 v18, 0 @@ -1926,16 +1960,16 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v55, v39, v49 ; SDAG-NEXT: v_or_b32_e32 v54, v38, v48 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[54:55] -; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; SDAG-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SDAG-NEXT: s_andn2_b64 s[4:5], exec, s[8:9] ; SDAG-NEXT: v_or_b32_e32 v10, v20, v10 ; SDAG-NEXT: v_mov_b32_e32 v21, v15 ; SDAG-NEXT: v_mov_b32_e32 v20, v14 -; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] -; SDAG-NEXT: s_cbranch_execnz .LBB2_9 +; SDAG-NEXT: s_cselect_b64 exec, s[4:5], s[8:9] +; 
SDAG-NEXT: s_cbranch_scc1 .LBB2_9 ; SDAG-NEXT: ; %bb.10: ; %Flow -; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: .LBB2_11: ; %Flow11 -; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] ; SDAG-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v11 ; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 @@ -1944,8 +1978,8 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v15, v15, v11 ; SDAG-NEXT: v_or_b32_e32 v18, v18, v12 ; SDAG-NEXT: v_or_b32_e32 v14, v14, v10 -; SDAG-NEXT: .LBB2_12: ; %Flow12 -; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] +; SDAG-NEXT: .LBB2_12: ; %udiv-end ; SDAG-NEXT: v_mul_lo_u32 v12, v33, v3 ; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v33, v2, 0 ; SDAG-NEXT: v_mul_lo_u32 v24, v27, v2 @@ -2021,6 +2055,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-LABEL: v_srem_v2i128_vv: ; GISEL: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b64 s[12:13], exec ; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: v_ashrrev_i32_e32 v28, 31, v3 ; GISEL-NEXT: v_ashrrev_i32_e32 v20, 31, v11 @@ -2092,14 +2127,15 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_or_b32_e32 v18, v19, v18 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v31, v16, 0, vcc -; GISEL-NEXT: v_and_b32_e32 v20, 1, v18 +; GISEL-NEXT: v_and_b32_e32 v19, 1, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v32, v17, 0, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v18, v8, 0, vcc +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v19 +; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GISEL-NEXT: v_cndmask_b32_e64 v19, v9, 0, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] 
-; GISEL-NEXT: s_cbranch_execz .LBB2_6 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB2_6 ; GISEL-NEXT: ; %bb.1: ; %udiv-bb15 ; GISEL-NEXT: v_add_i32_e32 v31, vcc, 1, v0 ; GISEL-NEXT: v_addc_u32_e64 v32, s[4:5], 0, v1, vcc @@ -2118,19 +2154,21 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cndmask_b32_e32 v21, 0, v1, vcc ; GISEL-NEXT: v_or_b32_e32 v0, v18, v2 ; GISEL-NEXT: v_or_b32_e32 v1, v19, v3 +; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GISEL-NEXT: v_cndmask_b32_e32 v0, v22, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, v23, v1, vcc +; GISEL-NEXT: s_xor_b64 s[14:15], s[4:5], exec ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 ; GISEL-NEXT: v_cndmask_b32_e32 v18, v0, v8, vcc +; GISEL-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e32 v19, v1, v9, vcc ; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] ; GISEL-NEXT: v_mov_b32_e32 v0, s8 ; GISEL-NEXT: v_mov_b32_e32 v1, s9 ; GISEL-NEXT: v_mov_b32_e32 v2, s10 ; GISEL-NEXT: v_mov_b32_e32 v3, s11 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] -; GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB2_5 ; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4 ; GISEL-NEXT: v_subrev_i32_e32 v24, vcc, 64, v31 ; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v31 @@ -2191,26 +2229,27 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_and_b32_e32 v25, v0, v29 ; GISEL-NEXT: v_and_b32_e32 v26, v0, v10 ; GISEL-NEXT: v_and_b32_e32 v0, v0, v11 +; GISEL-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GISEL-NEXT: v_sub_i32_e32 v24, vcc, v3, v1 ; GISEL-NEXT: v_subb_u32_e32 v25, vcc, v49, v25, vcc ; GISEL-NEXT: v_subb_u32_e32 v26, vcc, v2, v26, vcc ; GISEL-NEXT: v_subb_u32_e32 v27, vcc, v27, v0, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v22 ; GISEL-NEXT: v_mov_b32_e32 v1, v23 -; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GISEL-NEXT: s_cbranch_execnz 
.LBB2_3 +; GISEL-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GISEL-NEXT: s_cbranch_scc1 .LBB2_3 ; GISEL-NEXT: ; %bb.4: ; %Flow13 -; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: s_or_b64 exec, exec, s[14:15] ; GISEL-NEXT: .LBB2_5: ; %Flow14 -; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] ; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1 ; GISEL-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 ; GISEL-NEXT: v_lshrrev_b32_e32 v20, 31, v21 ; GISEL-NEXT: v_or_b32_e32 v18, v18, v20 ; GISEL-NEXT: v_or_b32_e32 v31, v0, v2 ; GISEL-NEXT: v_or_b32_e32 v32, v1, v3 -; GISEL-NEXT: .LBB2_6: ; %Flow16 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB2_6: ; %udiv-end1 +; GISEL-NEXT: s_mov_b64 s[12:13], exec ; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: v_ashrrev_i32_e32 v33, 31, v7 ; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v15 @@ -2282,14 +2321,15 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_or_b32_e32 v2, v3, v2 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v20, v12, 0, vcc -; GISEL-NEXT: v_and_b32_e32 v22, 1, v2 +; GISEL-NEXT: v_and_b32_e32 v3, 1, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v21, v13, 0, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v3 +; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GISEL-NEXT: v_cndmask_b32_e64 v3, v7, 0, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB2_12 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB2_12 ; GISEL-NEXT: ; %bb.7: ; %udiv-bb1 ; GISEL-NEXT: v_add_i32_e32 v36, vcc, 1, v0 ; GISEL-NEXT: v_addc_u32_e64 v37, s[4:5], 0, v1, vcc @@ -2308,19 +2348,21 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cndmask_b32_e32 v21, 0, v1, vcc ; GISEL-NEXT: v_or_b32_e32 v0, v14, v2 ; GISEL-NEXT: 
v_or_b32_e32 v1, v15, v3 +; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GISEL-NEXT: v_cndmask_b32_e32 v0, v22, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, v23, v1, vcc +; GISEL-NEXT: s_xor_b64 s[14:15], s[4:5], exec ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 ; GISEL-NEXT: v_cndmask_b32_e32 v14, v0, v6, vcc +; GISEL-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e32 v15, v1, v7, vcc ; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] ; GISEL-NEXT: v_mov_b32_e32 v0, s8 ; GISEL-NEXT: v_mov_b32_e32 v1, s9 ; GISEL-NEXT: v_mov_b32_e32 v2, s10 ; GISEL-NEXT: v_mov_b32_e32 v3, s11 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] -; GISEL-NEXT: s_cbranch_execz .LBB2_11 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB2_11 ; GISEL-NEXT: ; %bb.8: ; %udiv-preheader ; GISEL-NEXT: v_subrev_i32_e32 v24, vcc, 64, v36 ; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v36 @@ -2381,26 +2423,26 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_and_b32_e32 v25, v0, v34 ; GISEL-NEXT: v_and_b32_e32 v26, v0, v4 ; GISEL-NEXT: v_and_b32_e32 v52, v0, v5 +; GISEL-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GISEL-NEXT: v_sub_i32_e32 v24, vcc, v3, v1 ; GISEL-NEXT: v_subb_u32_e32 v25, vcc, v53, v25, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v22 ; GISEL-NEXT: v_mov_b32_e32 v1, v23 ; GISEL-NEXT: v_subb_u32_e32 v26, vcc, v2, v26, vcc ; GISEL-NEXT: v_subb_u32_e32 v27, vcc, v27, v52, vcc -; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GISEL-NEXT: s_cbranch_execnz .LBB2_9 +; GISEL-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GISEL-NEXT: s_cbranch_scc1 .LBB2_9 ; GISEL-NEXT: ; %bb.10: ; %Flow -; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: s_or_b64 exec, exec, s[14:15] ; GISEL-NEXT: .LBB2_11: ; %Flow11 -; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] ; GISEL-NEXT: v_lshl_b64 v[22:23], v[20:21], 1 ; GISEL-NEXT: v_lshl_b64 v[2:3], v[14:15], 1 ; GISEL-NEXT: v_lshrrev_b32_e32 v14, 31, v21 ; GISEL-NEXT: 
v_or_b32_e32 v2, v2, v14 ; GISEL-NEXT: v_or_b32_e32 v20, v0, v22 ; GISEL-NEXT: v_or_b32_e32 v21, v1, v23 -; GISEL-NEXT: .LBB2_12: ; %Flow12 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB2_12: ; %udiv-end ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v30, v31, 0 ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v30, v18, 0 ; GISEL-NEXT: v_mul_lo_u32 v24, v30, v19 @@ -2460,6 +2502,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-LABEL: v_urem_v2i128_vv: ; SDAG: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b64 s[8:9], exec ; SDAG-NEXT: v_or_b32_e32 v17, v9, v11 ; SDAG-NEXT: v_or_b32_e32 v16, v8, v10 ; SDAG-NEXT: v_or_b32_e32 v19, v1, v3 @@ -2473,7 +2516,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_ffbh_u32_e32 v26, v0 ; SDAG-NEXT: v_ffbh_u32_e32 v27, v1 ; SDAG-NEXT: v_mov_b32_e32 v28, 0 -; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f +; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] ; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19] ; SDAG-NEXT: v_add_i32_e64 v16, s[6:7], 32, v20 @@ -2499,7 +2542,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_subb_u32_e32 v17, vcc, v20, v17, vcc ; SDAG-NEXT: v_xor_b32_e32 v18, 0x7f, v16 ; SDAG-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v28, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[16:17] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[16:17] ; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] ; SDAG-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v28, vcc ; SDAG-NEXT: v_or_b32_e32 v18, v18, v20 @@ -2516,10 +2559,11 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; SDAG-NEXT: v_cndmask_b32_e64 v31, v2, 0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v30, v1, 0, s[4:5] +; SDAG-NEXT: s_and_b64 s[6:7], s[6:7], vcc +; SDAG-NEXT: s_and_b64 s[6:7], 
s[6:7], exec ; SDAG-NEXT: v_cndmask_b32_e64 v32, v0, 0, s[4:5] -; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB3_6 +; SDAG-NEXT: s_cmov_b64 exec, s[6:7] +; SDAG-NEXT: s_cbranch_scc0 .LBB3_6 ; SDAG-NEXT: ; %bb.1: ; %udiv-bb15 ; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v16 ; SDAG-NEXT: v_sub_i32_e64 v22, s[4:5], 63, v16 @@ -2539,19 +2583,20 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_lshr_b64 v[20:21], v[0:1], v27 ; SDAG-NEXT: v_or_b32_e32 v17, v17, v21 ; SDAG-NEXT: v_or_b32_e32 v16, v16, v20 +; SDAG-NEXT: s_xor_b64 s[6:7], vcc, exec ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v26 ; SDAG-NEXT: v_cndmask_b32_e64 v17, v23, v17, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v16, v22, v16, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, v25, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, v24, s[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], vcc, -1 ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v26 ; SDAG-NEXT: v_cndmask_b32_e64 v17, v17, v3, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v16, v16, v2, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: v_mov_b32_e32 v21, 0 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB3_5 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB3_5 ; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4 ; SDAG-NEXT: v_lshr_b64 v[18:19], v[0:1], v30 ; SDAG-NEXT: v_sub_i32_e32 v28, vcc, 64, v30 @@ -2616,15 +2661,15 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[38:39] ; SDAG-NEXT: v_or_b32_e32 v23, v25, v23 ; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; SDAG-NEXT: s_andn2_b64 s[4:5], exec, s[10:11] ; SDAG-NEXT: v_or_b32_e32 v22, v24, v22 ; SDAG-NEXT: v_mov_b32_e32 v25, v19 ; SDAG-NEXT: v_mov_b32_e32 v24, v18 -; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] -; SDAG-NEXT: s_cbranch_execnz .LBB3_3 +; SDAG-NEXT: 
s_cselect_b64 exec, s[4:5], s[10:11] +; SDAG-NEXT: s_cbranch_scc1 .LBB3_3 ; SDAG-NEXT: ; %bb.4: ; %Flow13 -; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: .LBB3_5: ; %Flow14 -; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] ; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v24, 31, v23 ; SDAG-NEXT: v_lshl_b64 v[22:23], v[22:23], 1 @@ -2633,8 +2678,9 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v30, v19, v23 ; SDAG-NEXT: v_or_b32_e32 v31, v20, v16 ; SDAG-NEXT: v_or_b32_e32 v32, v18, v22 -; SDAG-NEXT: .LBB3_6: ; %Flow16 -; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: .LBB3_6: ; %udiv-end1 +; SDAG-NEXT: s_mov_b64 s[8:9], exec ; SDAG-NEXT: v_or_b32_e32 v17, v13, v15 ; SDAG-NEXT: v_or_b32_e32 v16, v12, v14 ; SDAG-NEXT: v_or_b32_e32 v19, v5, v7 @@ -2648,7 +2694,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_ffbh_u32_e32 v26, v4 ; SDAG-NEXT: v_ffbh_u32_e32 v27, v5 ; SDAG-NEXT: v_mov_b32_e32 v28, 0 -; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f +; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] ; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19] ; SDAG-NEXT: v_add_i32_e64 v16, s[6:7], 32, v20 @@ -2674,7 +2720,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_subb_u32_e32 v17, vcc, v20, v17, vcc ; SDAG-NEXT: v_xor_b32_e32 v20, 0x7f, v16 ; SDAG-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v28, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[16:17] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[16:17] ; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] ; SDAG-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v28, vcc ; SDAG-NEXT: v_or_b32_e32 v20, v20, v18 @@ -2691,10 +2737,11 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; SDAG-NEXT: 
v_cndmask_b32_e64 v22, v6, 0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v21, v5, 0, s[4:5] +; SDAG-NEXT: s_and_b64 s[6:7], s[6:7], vcc +; SDAG-NEXT: s_and_b64 s[6:7], s[6:7], exec ; SDAG-NEXT: v_cndmask_b32_e64 v20, v4, 0, s[4:5] -; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB3_12 +; SDAG-NEXT: s_cmov_b64 exec, s[6:7] +; SDAG-NEXT: s_cbranch_scc0 .LBB3_12 ; SDAG-NEXT: ; %bb.7: ; %udiv-bb1 ; SDAG-NEXT: v_add_i32_e32 v34, vcc, 1, v16 ; SDAG-NEXT: v_sub_i32_e64 v22, s[4:5], 63, v16 @@ -2714,19 +2761,20 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_lshr_b64 v[16:17], v[4:5], v16 ; SDAG-NEXT: v_or_b32_e32 v17, v25, v17 ; SDAG-NEXT: v_or_b32_e32 v16, v24, v16 +; SDAG-NEXT: s_xor_b64 s[6:7], vcc, exec ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v19 ; SDAG-NEXT: v_cndmask_b32_e64 v18, v23, v17, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v22, v22, v16, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v17, 0, v27, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v16, 0, v26, s[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], vcc, -1 ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 ; SDAG-NEXT: v_cndmask_b32_e64 v19, v18, v7, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v18, v22, v6, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v22, 0 ; SDAG-NEXT: v_mov_b32_e32 v23, 0 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB3_11 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB3_11 ; SDAG-NEXT: ; %bb.8: ; %udiv-preheader ; SDAG-NEXT: v_lshr_b64 v[20:21], v[4:5], v34 ; SDAG-NEXT: v_sub_i32_e32 v28, vcc, 64, v34 @@ -2791,15 +2839,15 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v50, v34, v36 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[50:51] ; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; SDAG-NEXT: s_andn2_b64 s[4:5], exec, s[10:11] ; SDAG-NEXT: v_or_b32_e32 v16, v24, v16 ; SDAG-NEXT: 
v_mov_b32_e32 v25, v21 ; SDAG-NEXT: v_mov_b32_e32 v24, v20 -; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] -; SDAG-NEXT: s_cbranch_execnz .LBB3_9 +; SDAG-NEXT: s_cselect_b64 exec, s[4:5], s[10:11] +; SDAG-NEXT: s_cbranch_scc1 .LBB3_9 ; SDAG-NEXT: ; %bb.10: ; %Flow -; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: .LBB3_11: ; %Flow11 -; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] ; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v24, 31, v17 ; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 @@ -2808,8 +2856,8 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v21, v21, v17 ; SDAG-NEXT: v_or_b32_e32 v22, v22, v18 ; SDAG-NEXT: v_or_b32_e32 v20, v20, v16 -; SDAG-NEXT: .LBB3_12: ; %Flow12 -; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: .LBB3_12: ; %udiv-end ; SDAG-NEXT: v_mul_lo_u32 v18, v32, v11 ; SDAG-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v32, v10, 0 ; SDAG-NEXT: v_mul_lo_u32 v28, v30, v10 @@ -2870,6 +2918,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-LABEL: v_urem_v2i128_vv: ; GISEL: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b64 s[12:13], exec ; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: v_or_b32_e32 v16, v8, v10 ; GISEL-NEXT: v_or_b32_e32 v17, v9, v11 @@ -2923,14 +2972,15 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_or_b32_e32 v20, v21, v20 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 ; GISEL-NEXT: v_cndmask_b32_e64 v32, v0, 0, vcc -; GISEL-NEXT: v_and_b32_e32 v22, 1, v20 +; GISEL-NEXT: v_and_b32_e32 v21, 1, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v33, v1, 0, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v20, v2, 0, vcc +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v21 +; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], 
exec ; GISEL-NEXT: v_cndmask_b32_e64 v21, v3, 0, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB3_6 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB3_6 ; GISEL-NEXT: ; %bb.1: ; %udiv-bb15 ; GISEL-NEXT: v_add_i32_e32 v30, vcc, 1, v16 ; GISEL-NEXT: v_addc_u32_e64 v31, s[4:5], 0, v17, vcc @@ -2949,19 +2999,21 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cndmask_b32_e32 v23, 0, v17, vcc ; GISEL-NEXT: v_or_b32_e32 v16, v20, v18 ; GISEL-NEXT: v_or_b32_e32 v17, v21, v19 +; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GISEL-NEXT: v_cndmask_b32_e32 v16, v24, v16, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v17, v25, v17, vcc +; GISEL-NEXT: s_xor_b64 s[14:15], s[4:5], exec ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26 ; GISEL-NEXT: v_cndmask_b32_e32 v20, v16, v2, vcc +; GISEL-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e32 v21, v17, v3, vcc ; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] ; GISEL-NEXT: v_mov_b32_e32 v19, s11 ; GISEL-NEXT: v_mov_b32_e32 v18, s10 ; GISEL-NEXT: v_mov_b32_e32 v17, s9 ; GISEL-NEXT: v_mov_b32_e32 v16, s8 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] -; GISEL-NEXT: s_cbranch_execz .LBB3_5 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB3_5 ; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4 ; GISEL-NEXT: v_subrev_i32_e32 v26, vcc, 64, v30 ; GISEL-NEXT: v_sub_i32_e32 v24, vcc, 64, v30 @@ -3022,26 +3074,27 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_and_b32_e32 v27, v16, v9 ; GISEL-NEXT: v_and_b32_e32 v28, v16, v10 ; GISEL-NEXT: v_and_b32_e32 v16, v16, v11 +; GISEL-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GISEL-NEXT: v_sub_i32_e32 v26, vcc, v19, v17 ; GISEL-NEXT: v_subb_u32_e32 v27, vcc, v39, v27, vcc ; GISEL-NEXT: v_subb_u32_e32 v28, vcc, v18, 
v28, vcc ; GISEL-NEXT: v_subb_u32_e32 v29, vcc, v29, v16, vcc ; GISEL-NEXT: v_mov_b32_e32 v16, v24 ; GISEL-NEXT: v_mov_b32_e32 v17, v25 -; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GISEL-NEXT: s_cbranch_execnz .LBB3_3 +; GISEL-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GISEL-NEXT: s_cbranch_scc1 .LBB3_3 ; GISEL-NEXT: ; %bb.4: ; %Flow13 -; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: s_or_b64 exec, exec, s[14:15] ; GISEL-NEXT: .LBB3_5: ; %Flow14 -; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] ; GISEL-NEXT: v_lshl_b64 v[18:19], v[22:23], 1 ; GISEL-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 ; GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v23 ; GISEL-NEXT: v_or_b32_e32 v20, v20, v22 ; GISEL-NEXT: v_or_b32_e32 v32, v16, v18 ; GISEL-NEXT: v_or_b32_e32 v33, v17, v19 -; GISEL-NEXT: .LBB3_6: ; %Flow16 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB3_6: ; %udiv-end1 +; GISEL-NEXT: s_mov_b64 s[12:13], exec ; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: v_or_b32_e32 v16, v12, v14 ; GISEL-NEXT: v_or_b32_e32 v17, v13, v15 @@ -3095,14 +3148,15 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_or_b32_e32 v18, v19, v18 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 ; GISEL-NEXT: v_cndmask_b32_e64 v24, v4, 0, vcc -; GISEL-NEXT: v_and_b32_e32 v26, 1, v18 +; GISEL-NEXT: v_and_b32_e32 v19, 1, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v25, v5, 0, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v18, v6, 0, vcc +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v19 +; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GISEL-NEXT: v_cndmask_b32_e64 v19, v7, 0, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB3_12 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB3_12 ; GISEL-NEXT: ; %bb.7: ; %udiv-bb1 ; GISEL-NEXT: v_add_i32_e32 v34, vcc, 1, v16 ; GISEL-NEXT: v_addc_u32_e64 v35, 
s[4:5], 0, v17, vcc @@ -3121,19 +3175,21 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cndmask_b32_e32 v25, 0, v17, vcc ; GISEL-NEXT: v_or_b32_e32 v16, v22, v18 ; GISEL-NEXT: v_or_b32_e32 v17, v23, v19 +; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GISEL-NEXT: v_cndmask_b32_e32 v16, v26, v16, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v17, v27, v17, vcc +; GISEL-NEXT: s_xor_b64 s[14:15], s[4:5], exec ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v28 ; GISEL-NEXT: v_cndmask_b32_e32 v22, v16, v6, vcc +; GISEL-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e32 v23, v17, v7, vcc ; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] ; GISEL-NEXT: v_mov_b32_e32 v19, s11 ; GISEL-NEXT: v_mov_b32_e32 v18, s10 ; GISEL-NEXT: v_mov_b32_e32 v17, s9 ; GISEL-NEXT: v_mov_b32_e32 v16, s8 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] -; GISEL-NEXT: s_cbranch_execz .LBB3_11 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB3_11 ; GISEL-NEXT: ; %bb.8: ; %udiv-preheader ; GISEL-NEXT: v_subrev_i32_e32 v28, vcc, 64, v34 ; GISEL-NEXT: v_sub_i32_e32 v26, vcc, 64, v34 @@ -3194,26 +3250,26 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_and_b32_e32 v29, v16, v13 ; GISEL-NEXT: v_and_b32_e32 v30, v16, v14 ; GISEL-NEXT: v_and_b32_e32 v50, v16, v15 +; GISEL-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GISEL-NEXT: v_sub_i32_e32 v28, vcc, v19, v17 ; GISEL-NEXT: v_subb_u32_e32 v29, vcc, v51, v29, vcc ; GISEL-NEXT: v_mov_b32_e32 v16, v26 ; GISEL-NEXT: v_mov_b32_e32 v17, v27 ; GISEL-NEXT: v_subb_u32_e32 v30, vcc, v18, v30, vcc ; GISEL-NEXT: v_subb_u32_e32 v31, vcc, v31, v50, vcc -; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GISEL-NEXT: s_cbranch_execnz .LBB3_9 +; GISEL-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GISEL-NEXT: s_cbranch_scc1 .LBB3_9 ; GISEL-NEXT: ; %bb.10: ; %Flow -; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: s_or_b64 exec, 
exec, s[14:15] ; GISEL-NEXT: .LBB3_11: ; %Flow11 -; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] ; GISEL-NEXT: v_lshl_b64 v[26:27], v[24:25], 1 ; GISEL-NEXT: v_lshl_b64 v[18:19], v[22:23], 1 ; GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v25 ; GISEL-NEXT: v_or_b32_e32 v18, v18, v22 ; GISEL-NEXT: v_or_b32_e32 v24, v16, v26 ; GISEL-NEXT: v_or_b32_e32 v25, v17, v27 -; GISEL-NEXT: .LBB3_12: ; %Flow12 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB3_12: ; %udiv-end ; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v32, 0 ; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v8, v20, 0 ; GISEL-NEXT: v_mul_lo_u32 v28, v8, v21 diff --git a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll index 757458363284c6..9a7f1eb0b052e8 100644 --- a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll +++ b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll @@ -29,7 +29,6 @@ define amdgpu_ps void @main(i32 %0, float %1) { ; ISA-NEXT: s_branch .LBB0_3 ; ISA-NEXT: .LBB0_1: ; %Flow1 ; ISA-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; ISA-NEXT: s_or_b64 exec, exec, s[6:7] ; ISA-NEXT: s_mov_b64 s[6:7], 0 ; ISA-NEXT: .LBB0_2: ; %Flow ; ISA-NEXT: ; in Loop: Header=BB0_3 Depth=1 @@ -38,8 +37,9 @@ define amdgpu_ps void @main(i32 %0, float %1) { ; ISA-NEXT: s_andn2_b64 s[2:3], s[2:3], exec ; ISA-NEXT: s_and_b64 s[6:7], s[6:7], exec ; ISA-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] -; ISA-NEXT: s_andn2_b64 exec, exec, s[0:1] -; ISA-NEXT: s_cbranch_execz .LBB0_6 +; ISA-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; ISA-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; ISA-NEXT: s_cbranch_scc0 .LBB0_6 ; ISA-NEXT: .LBB0_3: ; %loop ; ISA-NEXT: ; =>This Inner Loop Header: Depth=1 ; ISA-NEXT: s_or_b64 s[4:5], s[4:5], exec @@ -48,22 +48,27 @@ define amdgpu_ps void @main(i32 %0, float %1) { ; ISA-NEXT: s_cbranch_scc0 .LBB0_2 ; ISA-NEXT: ; %bb.4: ; %endif1 ; ISA-NEXT: ; in Loop: Header=BB0_3 Depth=1 +; ISA-NEXT: s_mov_b64 s[6:7], 
exec +; ISA-NEXT: s_and_b64 s[10:11], vcc, exec ; ISA-NEXT: s_mov_b64 s[4:5], -1 -; ISA-NEXT: s_and_saveexec_b64 s[6:7], vcc -; ISA-NEXT: s_cbranch_execz .LBB0_1 +; ISA-NEXT: s_cmov_b64 exec, s[10:11] +; ISA-NEXT: s_cbranch_scc0 .LBB0_1 ; ISA-NEXT: ; %bb.5: ; %endif2 ; ISA-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; ISA-NEXT: s_add_i32 s8, s8, 1 ; ISA-NEXT: s_xor_b64 s[4:5], exec, -1 +; ISA-NEXT: s_or_b64 exec, exec, s[6:7] ; ISA-NEXT: s_branch .LBB0_1 ; ISA-NEXT: .LBB0_6: ; %Flow2 -; ISA-NEXT: s_or_b64 exec, exec, s[0:1] +; ISA-NEXT: s_mov_b64 s[0:1], exec +; ISA-NEXT: s_and_b64 s[2:3], s[2:3], exec ; ISA-NEXT: v_mov_b32_e32 v1, 0 -; ISA-NEXT: s_and_saveexec_b64 s[0:1], s[2:3] +; ISA-NEXT: s_cmov_b64 exec, s[2:3] +; ISA-NEXT: s_cbranch_scc0 .LBB0_8 ; ISA-NEXT: ; %bb.7: ; %if1 ; ISA-NEXT: v_sqrt_f32_e32 v1, v0 -; ISA-NEXT: ; %bb.8: ; %endloop ; ISA-NEXT: s_or_b64 exec, exec, s[0:1] +; ISA-NEXT: .LBB0_8: ; %endloop ; ISA-NEXT: exp mrt0 v1, v1, v1, v1 done vm ; ISA-NEXT: s_endpgm start: diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine.mir b/llvm/test/CodeGen/AMDGPU/dpp_combine.mir index 179d0becf6693a..85a81e96b3be57 100644 --- a/llvm/test/CodeGen/AMDGPU/dpp_combine.mir +++ b/llvm/test/CodeGen/AMDGPU/dpp_combine.mir @@ -430,9 +430,9 @@ body: | bb.1: successors: %bb.2 + SI_WAVE_RECONVERGE %8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2: - SI_END_CF %8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ... # GCN-LABEL: name: old_in_diff_bb diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir index 1151bde02ef62c..32a97b7c144ec7 100644 --- a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir +++ b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir @@ -383,9 +383,9 @@ body: | bb.1: successors: %bb.2 + SI_WAVE_RECONVERGE %8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2: - SI_END_CF %8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ... 
# GCN-LABEL: name: old_in_diff_bb diff --git a/llvm/test/CodeGen/AMDGPU/early-tailduplicator-terminator.mir b/llvm/test/CodeGen/AMDGPU/early-tailduplicator-terminator.mir index 41c6906b3c85ad..ee423a911400a5 100644 --- a/llvm/test/CodeGen/AMDGPU/early-tailduplicator-terminator.mir +++ b/llvm/test/CodeGen/AMDGPU/early-tailduplicator-terminator.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=early-tailduplication -verify-machineinstrs -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -run-pass=early-tailduplication -verify-machineinstrs -o - %s | FileCheck %s # Early tail duplication should not merge bb.6 into bb.5, adding a # non-terminator (S_SLEEP) after the terminator S_MOV_B32_term. @@ -22,19 +22,16 @@ body: | ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cond:sreg_32_xm0_xexec = IMPLICIT_DEF ; CHECK-NEXT: [[S_MOV_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32_term $exec_lo ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.3, implicit $exec + ; CHECK-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term %cond, $exec_lo, implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_MOV_B32_term]], %bb.3, implicit $exec, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: - ; CHECK-NEXT: successors: %bb.5(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_term]] - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.5: ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_SLEEP 2 @@ -46,13 +43,14 @@ body: | S_SLEEP 1 bb.3: + %cond:sreg_32_xm0_xexec = IMPLICIT_DEF %0:sreg_32_xm0_xexec = S_MOV_B32_term $exec_lo bb.4: - SI_WATERFALL_LOOP 
%bb.4, implicit $exec + %1:sreg_32_xm0_xexec = S_XOR_B32_term %cond, $exec_lo, implicit def $scc + SI_WATERFALL_LOOP %1, %0, %bb.4, implicit $exec, implicit $scc bb.5: - $exec_lo = S_MOV_B32_term %0 bb.6: S_SLEEP 2 diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector.ll index 36a93bd2511ced..bccb94f55c01b8 100644 --- a/llvm/test/CodeGen/AMDGPU/extract-subvector.ll +++ b/llvm/test/CodeGen/AMDGPU/extract-subvector.ll @@ -8,10 +8,12 @@ define <2 x i16> @extract_2xi16(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 % ; GCN-NEXT: v_and_b32_e32 v4, 1, v4 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GCN-NEXT: s_and_b64 s[6:7], s[4:5], exec +; GCN-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; GCN-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7] -; GCN-NEXT: s_cbranch_execz .LBB0_2 +; GCN-NEXT: s_cmov_b64 exec, s[6:7] +; GCN-NEXT: s_cbranch_scc0 .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %F ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: s_mov_b32 s11, 0xf000 @@ -36,9 +38,12 @@ define <2 x i16> @extract_2xi16(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 % ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_or_b32_e32 v4, v0, v1 ; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: .LBB0_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_4 +; GCN-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GCN-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GCN-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-NEXT: s_cbranch_scc0 .LBB0_4 ; GCN-NEXT: ; %bb.3: ; %T ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: s_mov_b32 s11, 0xf000 @@ -62,8 +67,8 @@ define <2 x i16> @extract_2xi16(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 % ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3 ; GCN-NEXT: v_or_b32_e32 v4, v2, v0 +; GCN-NEXT: 
s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: .LBB0_4: ; %exit -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: v_ashrrev_i32_e32 v0, 16, v4 ; GCN-NEXT: v_bfe_i32 v1, v4, 0, 16 ; GCN-NEXT: v_mov_b32_e32 v2, 0xffff @@ -102,10 +107,12 @@ define <2 x i64> @extract_2xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 % ; GCN-NEXT: v_and_b32_e32 v4, 1, v4 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GCN-NEXT: s_and_b64 s[6:7], s[4:5], exec +; GCN-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; GCN-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19 -; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7] -; GCN-NEXT: s_cbranch_execz .LBB1_2 +; GCN-NEXT: s_cmov_b64 exec, s[6:7] +; GCN-NEXT: s_cbranch_scc0 .LBB1_2 ; GCN-NEXT: ; %bb.1: ; %F ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: s_mov_b32 s11, 0xf000 @@ -120,9 +127,12 @@ define <2 x i64> @extract_2xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 % ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[2:3], s[8:11], 0 addr64 offset:48 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: .LBB1_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_4 +; GCN-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GCN-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GCN-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-NEXT: s_cbranch_scc0 .LBB1_4 ; GCN-NEXT: ; %bb.3: ; %T ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: s_mov_b32 s11, 0xf000 @@ -137,8 +147,8 @@ define <2 x i64> @extract_2xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 % ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 offset:48 glc ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: .LBB1_4: ; %exit -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; 
GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, 0xffff8000 ; GCN-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[4:5] @@ -173,10 +183,12 @@ define <4 x i64> @extract_4xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 % ; GCN-NEXT: v_and_b32_e32 v4, 1, v4 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GCN-NEXT: s_and_b64 s[6:7], s[4:5], exec +; GCN-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; GCN-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19 -; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7] -; GCN-NEXT: s_cbranch_execz .LBB2_2 +; GCN-NEXT: s_cmov_b64 exec, s[6:7] +; GCN-NEXT: s_cbranch_scc0 .LBB2_2 ; GCN-NEXT: ; %bb.1: ; %F ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: s_mov_b32 s11, 0xf000 @@ -191,9 +203,12 @@ define <4 x i64> @extract_4xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 % ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[2:3], s[8:11], 0 addr64 offset:48 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: .LBB2_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_4 +; GCN-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GCN-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GCN-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-NEXT: s_cbranch_scc0 .LBB2_4 ; GCN-NEXT: ; %bb.3: ; %T ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: s_mov_b32 s11, 0xf000 @@ -208,8 +223,8 @@ define <4 x i64> @extract_4xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 % ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 offset:48 glc ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: .LBB2_4: ; %exit -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, 0xffff8000 ; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 
0, v[4:5] @@ -250,10 +265,12 @@ define <8 x i64> @extract_8xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 % ; GCN-NEXT: v_and_b32_e32 v4, 1, v4 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GCN-NEXT: s_and_b64 s[6:7], s[4:5], exec +; GCN-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; GCN-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35 -; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7] -; GCN-NEXT: s_cbranch_execz .LBB3_2 +; GCN-NEXT: s_cmov_b64 exec, s[6:7] +; GCN-NEXT: s_cbranch_scc0 .LBB3_2 ; GCN-NEXT: ; %bb.1: ; %F ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: s_mov_b32 s11, 0xf000 @@ -276,9 +293,12 @@ define <8 x i64> @extract_8xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 % ; GCN-NEXT: buffer_load_dwordx4 v[16:19], v[2:3], s[8:11], 0 addr64 offset:48 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: .LBB3_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_4 +; GCN-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GCN-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GCN-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-NEXT: s_cbranch_scc0 .LBB3_4 ; GCN-NEXT: ; %bb.3: ; %T ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: s_mov_b32 s11, 0xf000 @@ -300,8 +320,8 @@ define <8 x i64> @extract_8xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 % ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_load_dwordx4 v[16:19], v[0:1], s[8:11], 0 addr64 offset:48 glc ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: .LBB3_4: ; %exit -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, 0xffff8000 ; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[6:7] ; 
GCN-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[8:9] @@ -354,10 +374,12 @@ define <2 x double> @extract_2xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i ; GCN-NEXT: v_and_b32_e32 v4, 1, v4 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GCN-NEXT: s_and_b64 s[6:7], s[4:5], exec +; GCN-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; GCN-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19 -; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7] -; GCN-NEXT: s_cbranch_execz .LBB4_2 +; GCN-NEXT: s_cmov_b64 exec, s[6:7] +; GCN-NEXT: s_cbranch_scc0 .LBB4_2 ; GCN-NEXT: ; %bb.1: ; %F ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: s_mov_b32 s11, 0xf000 @@ -372,9 +394,12 @@ define <2 x double> @extract_2xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[2:3], s[8:11], 0 addr64 offset:48 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: .LBB4_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_4 +; GCN-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GCN-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GCN-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-NEXT: s_cbranch_scc0 .LBB4_4 ; GCN-NEXT: ; %bb.3: ; %T ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: s_mov_b32 s11, 0xf000 @@ -389,8 +414,8 @@ define <2 x double> @extract_2xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 offset:48 glc ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: .LBB4_4: ; %exit -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, 0xbff00000 ; GCN-NEXT: v_cmp_lt_f64_e32 vcc, -1.0, v[4:5] @@ -425,10 +450,12 @@ define <4 x double> 
@extract_4xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i ; GCN-NEXT: v_and_b32_e32 v4, 1, v4 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GCN-NEXT: s_and_b64 s[6:7], s[4:5], exec +; GCN-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; GCN-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19 -; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7] -; GCN-NEXT: s_cbranch_execz .LBB5_2 +; GCN-NEXT: s_cmov_b64 exec, s[6:7] +; GCN-NEXT: s_cbranch_scc0 .LBB5_2 ; GCN-NEXT: ; %bb.1: ; %F ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: s_mov_b32 s11, 0xf000 @@ -443,9 +470,12 @@ define <4 x double> @extract_4xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[2:3], s[8:11], 0 addr64 offset:48 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: .LBB5_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_4 +; GCN-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GCN-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GCN-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-NEXT: s_cbranch_scc0 .LBB5_4 ; GCN-NEXT: ; %bb.3: ; %T ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: s_mov_b32 s11, 0xf000 @@ -460,8 +490,8 @@ define <4 x double> @extract_4xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 offset:48 glc ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: .LBB5_4: ; %exit -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, 0xbff00000 ; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[4:5] @@ -502,10 +532,12 @@ define <8 x double> @extract_8xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i ; GCN-NEXT: v_and_b32_e32 v4, 1, v4 ; 
GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GCN-NEXT: s_and_b64 s[6:7], s[4:5], exec +; GCN-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; GCN-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35 -; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7] -; GCN-NEXT: s_cbranch_execz .LBB6_2 +; GCN-NEXT: s_cmov_b64 exec, s[6:7] +; GCN-NEXT: s_cbranch_scc0 .LBB6_2 ; GCN-NEXT: ; %bb.1: ; %F ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: s_mov_b32 s11, 0xf000 @@ -528,9 +560,12 @@ define <8 x double> @extract_8xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i ; GCN-NEXT: buffer_load_dwordx4 v[16:19], v[2:3], s[8:11], 0 addr64 offset:48 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: .LBB6_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_4 +; GCN-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GCN-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GCN-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-NEXT: s_cbranch_scc0 .LBB6_4 ; GCN-NEXT: ; %bb.3: ; %T ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: s_mov_b32 s11, 0xf000 @@ -552,8 +587,8 @@ define <8 x double> @extract_8xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_load_dwordx4 v[16:19], v[0:1], s[8:11], 0 addr64 offset:48 glc ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: .LBB6_4: ; %exit -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, 0xbff00000 ; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[6:7] ; GCN-NEXT: v_cmp_nlt_f64_e64 s[4:5], -1.0, v[8:9] diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll 
b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index 422c8a0be23b49..377b6be97009bc 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -63,10 +63,10 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB0_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -75,26 +75,31 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX90A-NEXT: ; implicit-def: $vgpr3 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB0_6 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB0_6 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 +; GFX90A-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX90A-NEXT: ; implicit-def: $vgpr3 -; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB0_3 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX90A-NEXT: global_atomic_add_f32 v3, v[0:1], v2, off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: 
buffer_wbinvl1 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2 +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: .LBB0_3: ; %Flow -; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB0_5 +; GFX90A-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[6:7] +; GFX90A-NEXT: s_cbranch_scc0 .LBB0_5 ; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc @@ -102,20 +107,23 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v1, v3, v2 ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: .LBB0_5: ; %Flow1 -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: .LBB0_6: ; %Flow2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB0_8 +; GFX90A-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[4:5] +; GFX90A-NEXT: s_cbranch_scc0 .LBB0_8 ; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX90A-NEXT: ds_add_rtn_f32 v3, v0, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB0_8: ; %atomicrmw.phi -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB0_8: ; %atomicrmw.end ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -135,10 +143,10 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: 
v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB0_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -157,10 +165,10 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB0_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -179,10 +187,10 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB0_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 @@ -242,10 +250,10 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: 
s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB1_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -255,52 +263,55 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX90A-NEXT: ; implicit-def: $vgpr0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB1_3 -; GFX90A-NEXT: ; %bb.1: ; %Flow2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB1_8 -; GFX90A-NEXT: .LBB1_2: ; %atomicrmw.phi -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; GFX90A-NEXT: .LBB1_3: ; %atomicrmw.check.private +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB1_6 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v5 +; GFX90A-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX90A-NEXT: ; implicit-def: $vgpr0 -; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB1_5 -; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB1_3 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX90A-NEXT: global_atomic_add_f32 v0, v[4:5], v2, off glc ; GFX90A-NEXT: 
s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: .LBB1_5: ; %Flow -; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB1_7 -; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB1_3: ; %Flow +; GFX90A-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[6:7] +; GFX90A-NEXT: s_cbranch_scc0 .LBB1_5 +; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v1, -1, v4, vcc ; GFX90A-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v2, v0, v2 ; GFX90A-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GFX90A-NEXT: .LBB1_7: ; %Flow1 -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: .LBB1_5: ; %Flow1 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB1_2 -; GFX90A-NEXT: .LBB1_8: ; %atomicrmw.shared +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: .LBB1_6: ; %Flow2 +; GFX90A-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[4:5] +; GFX90A-NEXT: s_cbranch_scc0 .LBB1_8 +; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc ; GFX90A-NEXT: ds_add_rtn_f32 v0, v0, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB1_8: ; %atomicrmw.end ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -319,10 +330,10 @@ define float 
@flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB1_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -343,10 +354,10 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB1_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -366,10 +377,10 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB1_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr %ptr, i64 511 %result = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 @@ -434,10 +445,10 @@ 
define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB2_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -447,52 +458,55 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX90A-NEXT: ; implicit-def: $vgpr0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB2_3 -; GFX90A-NEXT: ; %bb.1: ; %Flow2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB2_8 -; GFX90A-NEXT: .LBB2_2: ; %atomicrmw.phi -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; GFX90A-NEXT: .LBB2_3: ; %atomicrmw.check.private +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB2_6 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v5 +; GFX90A-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX90A-NEXT: ; implicit-def: $vgpr0 -; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB2_5 -; GFX90A-NEXT: ; %bb.4: ; 
%atomicrmw.global +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB2_3 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX90A-NEXT: global_atomic_add_f32 v0, v[4:5], v2, off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: .LBB2_5: ; %Flow -; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB2_7 -; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB2_3: ; %Flow +; GFX90A-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[6:7] +; GFX90A-NEXT: s_cbranch_scc0 .LBB2_5 +; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v1, -1, v4, vcc ; GFX90A-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v2, v0, v2 ; GFX90A-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GFX90A-NEXT: .LBB2_7: ; %Flow1 -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: .LBB2_5: ; %Flow1 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB2_2 -; GFX90A-NEXT: .LBB2_8: ; %atomicrmw.shared +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: .LBB2_6: ; %Flow2 +; GFX90A-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[4:5] +; GFX90A-NEXT: s_cbranch_scc0 .LBB2_8 +; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc ; GFX90A-NEXT: ds_add_rtn_f32 v0, v0, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 
exec, exec, s[6:7] +; GFX90A-NEXT: .LBB2_8: ; %atomicrmw.end ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -515,10 +529,10 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB2_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -538,10 +552,10 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB2_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -561,10 +575,10 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB2_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, 
exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr %ptr, i64 -512 %result = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 @@ -623,10 +637,10 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__am ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB3_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -634,50 +648,53 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__am ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB3_3 -; GFX90A-NEXT: ; %bb.1: ; %Flow2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB3_8 -; GFX90A-NEXT: .LBB3_2: ; %atomicrmw.phi -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; GFX90A-NEXT: .LBB3_3: ; %atomicrmw.check.private +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB3_6 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 -; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX90A-NEXT: 
s_xor_b64 s[6:7], exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB3_5 -; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX90A-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: .LBB3_5: ; %Flow -; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB3_7 -; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB3_3: ; %Flow +; GFX90A-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[6:7] +; GFX90A-NEXT: s_cbranch_scc0 .LBB3_5 +; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX90A-NEXT: .LBB3_7: ; %Flow1 -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: .LBB3_5: ; %Flow1 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB3_2 -; GFX90A-NEXT: .LBB3_8: ; %atomicrmw.shared +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: .LBB3_6: ; %Flow2 +; GFX90A-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[4:5] +; GFX90A-NEXT: s_cbranch_scc0 .LBB3_8 +; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: 
v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX90A-NEXT: ds_add_f32 v0, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB3_8: ; %atomicrmw.phi ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -686,50 +703,53 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__am ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB3_3 -; GFX908-NEXT: ; %bb.1: ; %Flow2 -; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB3_8 -; GFX908-NEXT: .LBB3_2: ; %atomicrmw.phi -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: s_setpc_b64 s[30:31] -; GFX908-NEXT: .LBB3_3: ; %atomicrmw.check.private +; GFX908-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX908-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX908-NEXT: s_cmov_b64 exec, vcc +; GFX908-NEXT: s_cbranch_scc0 .LBB3_6 +; GFX908-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX908-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 -; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX908-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX908-NEXT: s_cbranch_execz .LBB3_5 -; GFX908-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX908-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX908-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX908-NEXT: s_cmov_b64 exec, vcc +; GFX908-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: ; implicit-def: $vgpr2 -; GFX908-NEXT: .LBB3_5: ; %Flow -; GFX908-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; 
GFX908-NEXT: s_cbranch_execz .LBB3_7 -; GFX908-NEXT: ; %bb.6: ; %atomicrmw.private +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: .LBB3_3: ; %Flow +; GFX908-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; GFX908-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX908-NEXT: s_cmov_b64 exec, s[6:7] +; GFX908-NEXT: s_cbranch_scc0 .LBB3_5 +; GFX908-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX908-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX908-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX908-NEXT: .LBB3_7: ; %Flow1 -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: .LBB3_5: ; %Flow1 ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: ; implicit-def: $vgpr2 -; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_cbranch_execz .LBB3_2 -; GFX908-NEXT: .LBB3_8: ; %atomicrmw.shared +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: .LBB3_6: ; %Flow2 +; GFX908-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX908-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX908-NEXT: s_cmov_b64 exec, s[4:5] +; GFX908-NEXT: s_cbranch_scc0 .LBB3_8 +; GFX908-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX908-NEXT: ds_add_f32 v0, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: .LBB3_8: ; %atomicrmw.phi ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -747,11 +767,11 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__am ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: 
s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB3_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -768,11 +788,11 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__am ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB3_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret void @@ -832,10 +852,10 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB4_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -845,50 +865,53 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base 
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB4_3 -; GFX90A-NEXT: ; %bb.1: ; %Flow2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB4_8 -; GFX90A-NEXT: .LBB4_2: ; %atomicrmw.phi -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; GFX90A-NEXT: .LBB4_3: ; %atomicrmw.check.private +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB4_6 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 -; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB4_5 -; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX90A-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB4_3 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: .LBB4_5: ; %Flow -; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB4_7 -; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB4_3: ; %Flow +; GFX90A-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[6:7] +; GFX90A-NEXT: s_cbranch_scc0 .LBB4_5 +; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX90A-NEXT: buffer_load_dword v1, v0, 
s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX90A-NEXT: .LBB4_7: ; %Flow1 -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: .LBB4_5: ; %Flow1 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB4_2 -; GFX90A-NEXT: .LBB4_8: ; %atomicrmw.shared +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: .LBB4_6: ; %Flow2 +; GFX90A-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[4:5] +; GFX90A-NEXT: s_cbranch_scc0 .LBB4_8 +; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX90A-NEXT: ds_add_f32 v0, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB4_8: ; %atomicrmw.phi ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -899,50 +922,53 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB4_3 -; GFX908-NEXT: ; %bb.1: ; %Flow2 -; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB4_8 -; GFX908-NEXT: .LBB4_2: ; %atomicrmw.phi -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: s_setpc_b64 s[30:31] -; GFX908-NEXT: .LBB4_3: ; %atomicrmw.check.private +; GFX908-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX908-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX908-NEXT: 
s_cmov_b64 exec, vcc +; GFX908-NEXT: s_cbranch_scc0 .LBB4_6 +; GFX908-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX908-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 -; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX908-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX908-NEXT: s_cbranch_execz .LBB4_5 -; GFX908-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX908-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX908-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX908-NEXT: s_cmov_b64 exec, vcc +; GFX908-NEXT: s_cbranch_scc0 .LBB4_3 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: ; implicit-def: $vgpr2 -; GFX908-NEXT: .LBB4_5: ; %Flow -; GFX908-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; GFX908-NEXT: s_cbranch_execz .LBB4_7 -; GFX908-NEXT: ; %bb.6: ; %atomicrmw.private +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: .LBB4_3: ; %Flow +; GFX908-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; GFX908-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX908-NEXT: s_cmov_b64 exec, s[6:7] +; GFX908-NEXT: s_cbranch_scc0 .LBB4_5 +; GFX908-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX908-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX908-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX908-NEXT: .LBB4_7: ; %Flow1 -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: .LBB4_5: ; %Flow1 ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: ; implicit-def: $vgpr2 -; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_cbranch_execz .LBB4_2 -; GFX908-NEXT: .LBB4_8: ; %atomicrmw.shared +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: .LBB4_6: ; %Flow2 +; GFX908-NEXT: 
s_xor_b64 s[6:7], s[4:5], exec +; GFX908-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX908-NEXT: s_cmov_b64 exec, s[4:5] +; GFX908-NEXT: s_cbranch_scc0 .LBB4_8 +; GFX908-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX908-NEXT: ds_add_f32 v0, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: .LBB4_8: ; %atomicrmw.phi ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -962,11 +988,11 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB4_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -985,11 +1011,11 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB4_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr %ptr, i64 511 %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory 
!0, !amdgpu.ignore.denormal.mode !0 @@ -1055,10 +1081,10 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB5_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -1068,50 +1094,53 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB5_3 -; GFX90A-NEXT: ; %bb.1: ; %Flow2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB5_8 -; GFX90A-NEXT: .LBB5_2: ; %atomicrmw.phi -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; GFX90A-NEXT: .LBB5_3: ; %atomicrmw.check.private +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB5_6 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 -; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB5_5 -; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX90A-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 
vcc, 0 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB5_3 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: .LBB5_5: ; %Flow -; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB5_7 -; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB5_3: ; %Flow +; GFX90A-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[6:7] +; GFX90A-NEXT: s_cbranch_scc0 .LBB5_5 +; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX90A-NEXT: .LBB5_7: ; %Flow1 -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: .LBB5_5: ; %Flow1 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB5_2 -; GFX90A-NEXT: .LBB5_8: ; %atomicrmw.shared +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: .LBB5_6: ; %Flow2 +; GFX90A-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[4:5] +; GFX90A-NEXT: s_cbranch_scc0 .LBB5_8 +; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX90A-NEXT: ds_add_f32 v0, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; 
GFX90A-NEXT: .LBB5_8: ; %atomicrmw.phi ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1122,50 +1151,53 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB5_3 -; GFX908-NEXT: ; %bb.1: ; %Flow2 -; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB5_8 -; GFX908-NEXT: .LBB5_2: ; %atomicrmw.phi -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: s_setpc_b64 s[30:31] -; GFX908-NEXT: .LBB5_3: ; %atomicrmw.check.private +; GFX908-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX908-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX908-NEXT: s_cmov_b64 exec, vcc +; GFX908-NEXT: s_cbranch_scc0 .LBB5_6 +; GFX908-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX908-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 -; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX908-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX908-NEXT: s_cbranch_execz .LBB5_5 -; GFX908-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX908-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX908-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX908-NEXT: s_cmov_b64 exec, vcc +; GFX908-NEXT: s_cbranch_scc0 .LBB5_3 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: ; implicit-def: $vgpr2 -; GFX908-NEXT: .LBB5_5: ; %Flow -; GFX908-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; GFX908-NEXT: s_cbranch_execz .LBB5_7 -; GFX908-NEXT: ; %bb.6: ; %atomicrmw.private +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: .LBB5_3: ; %Flow +; GFX908-NEXT: s_xor_b64 s[8:9], 
s[6:7], exec +; GFX908-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX908-NEXT: s_cmov_b64 exec, s[6:7] +; GFX908-NEXT: s_cbranch_scc0 .LBB5_5 +; GFX908-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX908-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX908-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX908-NEXT: .LBB5_7: ; %Flow1 -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: .LBB5_5: ; %Flow1 ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: ; implicit-def: $vgpr2 -; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_cbranch_execz .LBB5_2 -; GFX908-NEXT: .LBB5_8: ; %atomicrmw.shared +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: .LBB5_6: ; %Flow2 +; GFX908-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX908-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX908-NEXT: s_cmov_b64 exec, s[4:5] +; GFX908-NEXT: s_cbranch_scc0 .LBB5_8 +; GFX908-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX908-NEXT: ds_add_f32 v0, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: .LBB5_8: ; %atomicrmw.phi ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -1185,11 +1217,11 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB5_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -1208,11 +1240,11 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB5_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr %ptr, i64 -512 %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 @@ -1272,10 +1304,10 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB6_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -1285,25 +1317,20 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX90A-NEXT: ; 
implicit-def: $vgpr0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB6_3 -; GFX90A-NEXT: ; %bb.1: ; %Flow2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB6_8 -; GFX90A-NEXT: .LBB6_2: ; %atomicrmw.phi -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; GFX90A-NEXT: .LBB6_3: ; %atomicrmw.check.private +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB6_6 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v5 +; GFX90A-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX90A-NEXT: ; implicit-def: $vgpr0 -; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB6_5 -; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB6_3 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_add_f32 v0, v[4:5], v2, off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1311,28 +1338,36 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: .LBB6_5: ; %Flow -; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB6_7 -; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB6_3: ; %Flow +; GFX90A-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[6:7] +; GFX90A-NEXT: s_cbranch_scc0 .LBB6_5 +; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX90A-NEXT: 
v_cndmask_b32_e32 v1, -1, v4, vcc ; GFX90A-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v2, v0, v2 ; GFX90A-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GFX90A-NEXT: .LBB6_7: ; %Flow1 -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: .LBB6_5: ; %Flow1 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB6_2 -; GFX90A-NEXT: .LBB6_8: ; %atomicrmw.shared +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: .LBB6_6: ; %Flow2 +; GFX90A-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[4:5] +; GFX90A-NEXT: s_cbranch_scc0 .LBB6_8 +; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc ; GFX90A-NEXT: ds_add_rtn_f32 v0, v0, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB6_8: ; %atomicrmw.end ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1351,10 +1386,10 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB6_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -1375,10 +1410,10 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: 
buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB6_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -1398,10 +1433,10 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB6_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr %ptr, i64 511 %result = atomicrmw fadd ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 @@ -1462,10 +1497,10 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB7_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -1475,23 +1510,18 @@ define void 
@flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB7_3 -; GFX90A-NEXT: ; %bb.1: ; %Flow2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB7_8 -; GFX90A-NEXT: .LBB7_2: ; %atomicrmw.phi -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; GFX90A-NEXT: .LBB7_3: ; %atomicrmw.check.private +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB7_6 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 -; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB7_5 -; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX90A-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1499,28 +1529,36 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: .LBB7_5: ; %Flow -; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB7_7 -; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB7_3: ; %Flow +; GFX90A-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; 
GFX90A-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[6:7] +; GFX90A-NEXT: s_cbranch_scc0 .LBB7_5 +; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX90A-NEXT: .LBB7_7: ; %Flow1 -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: .LBB7_5: ; %Flow1 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB7_2 -; GFX90A-NEXT: .LBB7_8: ; %atomicrmw.shared +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: .LBB7_6: ; %Flow2 +; GFX90A-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[4:5] +; GFX90A-NEXT: s_cbranch_scc0 .LBB7_8 +; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX90A-NEXT: ds_add_f32 v0, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB7_8: ; %atomicrmw.phi ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1531,50 +1569,53 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB7_3 -; GFX908-NEXT: ; %bb.1: ; %Flow2 -; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB7_8 -; 
GFX908-NEXT: .LBB7_2: ; %atomicrmw.phi -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: s_setpc_b64 s[30:31] -; GFX908-NEXT: .LBB7_3: ; %atomicrmw.check.private +; GFX908-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX908-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX908-NEXT: s_cmov_b64 exec, vcc +; GFX908-NEXT: s_cbranch_scc0 .LBB7_6 +; GFX908-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX908-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 -; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX908-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX908-NEXT: s_cbranch_execz .LBB7_5 -; GFX908-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX908-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX908-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX908-NEXT: s_cmov_b64 exec, vcc +; GFX908-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: ; implicit-def: $vgpr2 -; GFX908-NEXT: .LBB7_5: ; %Flow -; GFX908-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; GFX908-NEXT: s_cbranch_execz .LBB7_7 -; GFX908-NEXT: ; %bb.6: ; %atomicrmw.private +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: .LBB7_3: ; %Flow +; GFX908-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; GFX908-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX908-NEXT: s_cmov_b64 exec, s[6:7] +; GFX908-NEXT: s_cbranch_scc0 .LBB7_5 +; GFX908-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX908-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX908-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX908-NEXT: .LBB7_7: ; %Flow1 -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: .LBB7_5: ; %Flow1 ; 
GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: ; implicit-def: $vgpr2 -; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_cbranch_execz .LBB7_2 -; GFX908-NEXT: .LBB7_8: ; %atomicrmw.shared +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: .LBB7_6: ; %Flow2 +; GFX908-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX908-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX908-NEXT: s_cmov_b64 exec, s[4:5] +; GFX908-NEXT: s_cbranch_scc0 .LBB7_8 +; GFX908-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX908-NEXT: ds_add_f32 v0, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: .LBB7_8: ; %atomicrmw.phi ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -1594,11 +1635,11 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB7_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -1617,11 +1658,11 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB7_1 +; GFX7-NEXT: s_cselect_b64 exec, 
s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr %ptr, i64 511 %unused = atomicrmw fadd ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 @@ -1669,11 +1710,11 @@ define void @flat_agent_atomic_fadd_noret_f32_maybe_remote(ptr %ptr, float %val) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32_maybe_remote: @@ -1695,10 +1736,10 @@ define void @flat_agent_atomic_fadd_noret_f32_maybe_remote(ptr %ptr, float %val) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB8_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32_maybe_remote: @@ -1715,11 +1756,11 @@ define void @flat_agent_atomic_fadd_noret_f32_maybe_remote(ptr %ptr, float %val) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; 
GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32_maybe_remote: @@ -1736,11 +1777,11 @@ define void @flat_agent_atomic_fadd_noret_f32_maybe_remote(ptr %ptr, float %val) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB8_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32_maybe_remote: @@ -1759,11 +1800,11 @@ define void @flat_agent_atomic_fadd_noret_f32_maybe_remote(ptr %ptr, float %val) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB8_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32_maybe_remote: @@ -1782,11 +1823,11 @@ define void @flat_agent_atomic_fadd_noret_f32_maybe_remote(ptr %ptr, float %val) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: 
v_mov_b32_e32 v4, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr %ptr, i64 511 %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst @@ -1847,10 +1888,10 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory(pt ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB9_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: @@ -1867,11 +1908,11 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: @@ -1888,11 +1929,11 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: 
s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB9_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: @@ -1911,11 +1952,11 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory(pt ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB9_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: @@ -1934,11 +1975,11 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory(pt ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB9_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr %ptr, i64 511 %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -1999,10 +2040,10 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__a ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, 
v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB10_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -2012,50 +2053,53 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__a ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB10_3 -; GFX90A-NEXT: ; %bb.1: ; %Flow2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB10_8 -; GFX90A-NEXT: .LBB10_2: ; %atomicrmw.phi -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; GFX90A-NEXT: .LBB10_3: ; %atomicrmw.check.private +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB10_6 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 -; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB10_5 -; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX90A-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB10_3 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off ; 
GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: .LBB10_5: ; %Flow -; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB10_7 -; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB10_3: ; %Flow +; GFX90A-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[6:7] +; GFX90A-NEXT: s_cbranch_scc0 .LBB10_5 +; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX90A-NEXT: .LBB10_7: ; %Flow1 -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: .LBB10_5: ; %Flow1 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB10_2 -; GFX90A-NEXT: .LBB10_8: ; %atomicrmw.shared +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: .LBB10_6: ; %Flow2 +; GFX90A-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[4:5] +; GFX90A-NEXT: s_cbranch_scc0 .LBB10_8 +; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX90A-NEXT: ds_add_f32 v0, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB10_8: ; %atomicrmw.phi ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2066,50 +2110,53 @@ define void 
@flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__a ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB10_3 -; GFX908-NEXT: ; %bb.1: ; %Flow2 -; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB10_8 -; GFX908-NEXT: .LBB10_2: ; %atomicrmw.phi -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: s_setpc_b64 s[30:31] -; GFX908-NEXT: .LBB10_3: ; %atomicrmw.check.private +; GFX908-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX908-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX908-NEXT: s_cmov_b64 exec, vcc +; GFX908-NEXT: s_cbranch_scc0 .LBB10_6 +; GFX908-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX908-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 -; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX908-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX908-NEXT: s_cbranch_execz .LBB10_5 -; GFX908-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX908-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX908-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX908-NEXT: s_cmov_b64 exec, vcc +; GFX908-NEXT: s_cbranch_scc0 .LBB10_3 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: ; implicit-def: $vgpr2 -; GFX908-NEXT: .LBB10_5: ; %Flow -; GFX908-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; GFX908-NEXT: s_cbranch_execz .LBB10_7 -; GFX908-NEXT: ; %bb.6: ; %atomicrmw.private +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: .LBB10_3: ; %Flow +; GFX908-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; GFX908-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX908-NEXT: s_cmov_b64 exec, s[6:7] +; GFX908-NEXT: s_cbranch_scc0 .LBB10_5 +; 
GFX908-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX908-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX908-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX908-NEXT: .LBB10_7: ; %Flow1 -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: .LBB10_5: ; %Flow1 ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: ; implicit-def: $vgpr2 -; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_cbranch_execz .LBB10_2 -; GFX908-NEXT: .LBB10_8: ; %atomicrmw.shared +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: .LBB10_6: ; %Flow2 +; GFX908-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX908-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX908-NEXT: s_cmov_b64 exec, s[4:5] +; GFX908-NEXT: s_cbranch_scc0 .LBB10_8 +; GFX908-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX908-NEXT: ds_add_f32 v0, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: .LBB10_8: ; %atomicrmw.phi ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -2129,11 +2176,11 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__a ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB10_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: 
flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -2152,11 +2199,11 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__a ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB10_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr %ptr, i64 511 %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 @@ -2204,11 +2251,11 @@ define void @flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr %p ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB11_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: @@ -2230,10 +2277,10 @@ define void @flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr %p ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB11_1 +; GFX10-NEXT: s_andn2_b32 s5, 
exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: @@ -2250,11 +2297,11 @@ define void @flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr %p ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: @@ -2271,11 +2318,11 @@ define void @flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr %p ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB11_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: @@ -2294,11 +2341,11 @@ define void @flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr %p ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 
exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB11_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: @@ -2317,11 +2364,11 @@ define void @flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr %p ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB11_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr %ptr, i64 511 %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.ignore.denormal.mode !0 @@ -2383,10 +2430,10 @@ define float @flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB12_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2395,26 +2442,31 @@ define float @flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 
; GFX90A-NEXT: ; implicit-def: $vgpr3 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB12_6 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB12_6 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 +; GFX90A-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX90A-NEXT: ; implicit-def: $vgpr3 -; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB12_3 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB12_3 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX90A-NEXT: global_atomic_add_f32 v3, v[0:1], v2, off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2 +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: .LBB12_3: ; %Flow -; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB12_5 +; GFX90A-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[6:7] +; GFX90A-NEXT: s_cbranch_scc0 .LBB12_5 ; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc @@ -2422,20 +2474,23 @@ define float @flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v1, v3, v2 ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: .LBB12_5: ; %Flow1 -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: .LBB12_6: ; %Flow2 -; GFX90A-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB12_8 +; GFX90A-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[4:5] +; GFX90A-NEXT: s_cbranch_scc0 .LBB12_8 ; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX90A-NEXT: ds_add_rtn_f32 v3, v0, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB12_8: ; %atomicrmw.phi -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB12_8: ; %atomicrmw.end ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -2455,10 +2510,10 @@ define float @flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB12_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -2477,10 +2532,10 @@ define float @flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB12_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -2499,10 +2554,10 @@ define float 
@flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB12_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -2562,10 +2617,10 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB13_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2575,52 +2630,55 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX90A-NEXT: ; implicit-def: $vgpr0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB13_3 -; GFX90A-NEXT: ; %bb.1: ; %Flow2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB13_8 -; GFX90A-NEXT: .LBB13_2: ; %atomicrmw.phi -; 
GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; GFX90A-NEXT: .LBB13_3: ; %atomicrmw.check.private +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB13_6 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v5 +; GFX90A-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX90A-NEXT: ; implicit-def: $vgpr0 -; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB13_5 -; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB13_3 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX90A-NEXT: global_atomic_add_f32 v0, v[4:5], v2, off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: .LBB13_5: ; %Flow -; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB13_7 -; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB13_3: ; %Flow +; GFX90A-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[6:7] +; GFX90A-NEXT: s_cbranch_scc0 .LBB13_5 +; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v1, -1, v4, vcc ; GFX90A-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v2, v0, v2 ; GFX90A-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GFX90A-NEXT: .LBB13_7: ; %Flow1 -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: .LBB13_5: ; %Flow1 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: 
$vgpr2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB13_2 -; GFX90A-NEXT: .LBB13_8: ; %atomicrmw.shared +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: .LBB13_6: ; %Flow2 +; GFX90A-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[4:5] +; GFX90A-NEXT: s_cbranch_scc0 .LBB13_8 +; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc ; GFX90A-NEXT: ds_add_rtn_f32 v0, v0, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB13_8: ; %atomicrmw.end ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2639,10 +2697,10 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB13_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -2663,10 +2721,10 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB13_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: 
flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2686,10 +2744,10 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB13_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr %ptr, i64 511 %result = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -2754,10 +2812,10 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB14_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2767,52 +2825,55 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX90A-NEXT: ; implicit-def: $vgpr0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_3 -; GFX90A-NEXT: ; %bb.1: ; %Flow2 -; GFX90A-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_8 -; GFX90A-NEXT: .LBB14_2: ; %atomicrmw.phi -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; GFX90A-NEXT: .LBB14_3: ; %atomicrmw.check.private +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB14_6 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v5 +; GFX90A-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX90A-NEXT: ; implicit-def: $vgpr0 -; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB14_5 -; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB14_3 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX90A-NEXT: global_atomic_add_f32 v0, v[4:5], v2, off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: .LBB14_5: ; %Flow -; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB14_7 -; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB14_3: ; %Flow +; GFX90A-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[6:7] +; GFX90A-NEXT: s_cbranch_scc0 .LBB14_5 +; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v1, -1, v4, vcc ; GFX90A-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v2, v0, v2 ; GFX90A-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GFX90A-NEXT: .LBB14_7: ; %Flow1 -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_or_b64 exec, 
exec, s[8:9] +; GFX90A-NEXT: .LBB14_5: ; %Flow1 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB14_2 -; GFX90A-NEXT: .LBB14_8: ; %atomicrmw.shared +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: .LBB14_6: ; %Flow2 +; GFX90A-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[4:5] +; GFX90A-NEXT: s_cbranch_scc0 .LBB14_8 +; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc ; GFX90A-NEXT: ds_add_rtn_f32 v0, v0, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB14_8: ; %atomicrmw.end ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2835,10 +2896,10 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB14_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2858,10 +2919,10 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB14_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], 
s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2881,10 +2942,10 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB14_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr %ptr, i64 -512 %result = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -2943,10 +3004,10 @@ define void @flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB15_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -2954,50 +3015,53 @@ define void @flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz 
.LBB15_3 -; GFX90A-NEXT: ; %bb.1: ; %Flow2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_8 -; GFX90A-NEXT: .LBB15_2: ; %atomicrmw.phi -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; GFX90A-NEXT: .LBB15_3: ; %atomicrmw.check.private +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB15_6 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 -; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB15_5 -; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX90A-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: .LBB15_5: ; %Flow -; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB15_7 -; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB15_3: ; %Flow +; GFX90A-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[6:7] +; GFX90A-NEXT: s_cbranch_scc0 .LBB15_5 +; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX90A-NEXT: 
.LBB15_7: ; %Flow1 -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: .LBB15_5: ; %Flow1 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB15_2 -; GFX90A-NEXT: .LBB15_8: ; %atomicrmw.shared +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: .LBB15_6: ; %Flow2 +; GFX90A-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[4:5] +; GFX90A-NEXT: s_cbranch_scc0 .LBB15_8 +; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX90A-NEXT: ds_add_f32 v0, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB15_8: ; %atomicrmw.phi ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3006,50 +3070,53 @@ define void @flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB15_3 -; GFX908-NEXT: ; %bb.1: ; %Flow2 -; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB15_8 -; GFX908-NEXT: .LBB15_2: ; %atomicrmw.phi -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: s_setpc_b64 s[30:31] -; GFX908-NEXT: .LBB15_3: ; %atomicrmw.check.private +; GFX908-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX908-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX908-NEXT: s_cmov_b64 exec, vcc +; GFX908-NEXT: s_cbranch_scc0 .LBB15_6 +; GFX908-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX908-NEXT: s_mov_b64 s[6:7], 
src_private_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 -; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX908-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX908-NEXT: s_cbranch_execz .LBB15_5 -; GFX908-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX908-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX908-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX908-NEXT: s_cmov_b64 exec, vcc +; GFX908-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: ; implicit-def: $vgpr2 -; GFX908-NEXT: .LBB15_5: ; %Flow -; GFX908-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; GFX908-NEXT: s_cbranch_execz .LBB15_7 -; GFX908-NEXT: ; %bb.6: ; %atomicrmw.private +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: .LBB15_3: ; %Flow +; GFX908-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; GFX908-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX908-NEXT: s_cmov_b64 exec, s[6:7] +; GFX908-NEXT: s_cbranch_scc0 .LBB15_5 +; GFX908-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX908-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX908-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX908-NEXT: .LBB15_7: ; %Flow1 -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: .LBB15_5: ; %Flow1 ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: ; implicit-def: $vgpr2 -; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_cbranch_execz .LBB15_2 -; GFX908-NEXT: .LBB15_8: ; %atomicrmw.shared +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: .LBB15_6: ; %Flow2 +; GFX908-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX908-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX908-NEXT: s_cmov_b64 exec, s[4:5] +; GFX908-NEXT: 
s_cbranch_scc0 .LBB15_8 +; GFX908-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX908-NEXT: ds_add_f32 v0, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: .LBB15_8: ; %atomicrmw.phi ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -3067,11 +3134,11 @@ define void @flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB15_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -3088,11 +3155,11 @@ define void @flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB15_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -3152,10 +3219,10 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; 
GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB16_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -3165,50 +3232,53 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB16_3 -; GFX90A-NEXT: ; %bb.1: ; %Flow2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB16_8 -; GFX90A-NEXT: .LBB16_2: ; %atomicrmw.phi -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; GFX90A-NEXT: .LBB16_3: ; %atomicrmw.check.private +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB16_6 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 -; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB16_5 -; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX90A-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB16_3 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: 
buffer_wbinvl1 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: .LBB16_5: ; %Flow -; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB16_7 -; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB16_3: ; %Flow +; GFX90A-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[6:7] +; GFX90A-NEXT: s_cbranch_scc0 .LBB16_5 +; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX90A-NEXT: .LBB16_7: ; %Flow1 -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: .LBB16_5: ; %Flow1 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB16_2 -; GFX90A-NEXT: .LBB16_8: ; %atomicrmw.shared +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: .LBB16_6: ; %Flow2 +; GFX90A-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[4:5] +; GFX90A-NEXT: s_cbranch_scc0 .LBB16_8 +; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX90A-NEXT: ds_add_f32 v0, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB16_8: ; %atomicrmw.phi ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3219,50 +3289,53 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; 
GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB16_3 -; GFX908-NEXT: ; %bb.1: ; %Flow2 -; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB16_8 -; GFX908-NEXT: .LBB16_2: ; %atomicrmw.phi -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: s_setpc_b64 s[30:31] -; GFX908-NEXT: .LBB16_3: ; %atomicrmw.check.private +; GFX908-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX908-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX908-NEXT: s_cmov_b64 exec, vcc +; GFX908-NEXT: s_cbranch_scc0 .LBB16_6 +; GFX908-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX908-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 -; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX908-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX908-NEXT: s_cbranch_execz .LBB16_5 -; GFX908-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX908-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX908-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX908-NEXT: s_cmov_b64 exec, vcc +; GFX908-NEXT: s_cbranch_scc0 .LBB16_3 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: ; implicit-def: $vgpr2 -; GFX908-NEXT: .LBB16_5: ; %Flow -; GFX908-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; GFX908-NEXT: s_cbranch_execz .LBB16_7 -; GFX908-NEXT: ; %bb.6: ; %atomicrmw.private +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: .LBB16_3: ; %Flow +; GFX908-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; GFX908-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX908-NEXT: s_cmov_b64 exec, s[6:7] +; GFX908-NEXT: s_cbranch_scc0 .LBB16_5 +; GFX908-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 
vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX908-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX908-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX908-NEXT: .LBB16_7: ; %Flow1 -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: .LBB16_5: ; %Flow1 ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: ; implicit-def: $vgpr2 -; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_cbranch_execz .LBB16_2 -; GFX908-NEXT: .LBB16_8: ; %atomicrmw.shared +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: .LBB16_6: ; %Flow2 +; GFX908-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX908-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX908-NEXT: s_cmov_b64 exec, s[4:5] +; GFX908-NEXT: s_cbranch_scc0 .LBB16_8 +; GFX908-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX908-NEXT: ds_add_f32 v0, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: .LBB16_8: ; %atomicrmw.phi ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -3282,11 +3355,11 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB16_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -3305,11 +3378,11 @@ 
define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB16_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr %ptr, i64 511 %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -3375,10 +3448,10 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB17_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -3388,50 +3461,53 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_3 -; GFX90A-NEXT: ; %bb.1: ; %Flow2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_8 -; GFX90A-NEXT: .LBB17_2: ; %atomicrmw.phi -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: 
s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; GFX90A-NEXT: .LBB17_3: ; %atomicrmw.check.private +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB17_6 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 -; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB17_5 -; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX90A-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB17_3 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: .LBB17_5: ; %Flow -; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB17_7 -; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB17_3: ; %Flow +; GFX90A-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[6:7] +; GFX90A-NEXT: s_cbranch_scc0 .LBB17_5 +; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX90A-NEXT: .LBB17_7: ; %Flow1 -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: .LBB17_5: ; %Flow1 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB17_2 -; GFX90A-NEXT: .LBB17_8: ; %atomicrmw.shared +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: .LBB17_6: ; %Flow2 +; GFX90A-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[4:5] +; GFX90A-NEXT: s_cbranch_scc0 .LBB17_8 +; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX90A-NEXT: ds_add_f32 v0, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB17_8: ; %atomicrmw.phi ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3442,50 +3518,53 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB17_3 -; GFX908-NEXT: ; %bb.1: ; %Flow2 -; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB17_8 -; GFX908-NEXT: .LBB17_2: ; %atomicrmw.phi -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: s_setpc_b64 s[30:31] -; GFX908-NEXT: .LBB17_3: ; %atomicrmw.check.private +; GFX908-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX908-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX908-NEXT: s_cmov_b64 exec, vcc +; GFX908-NEXT: s_cbranch_scc0 .LBB17_6 +; GFX908-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX908-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 -; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX908-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX908-NEXT: s_cbranch_execz .LBB17_5 -; GFX908-NEXT: ; %bb.4: ; %atomicrmw.global +; 
GFX908-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX908-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX908-NEXT: s_cmov_b64 exec, vcc +; GFX908-NEXT: s_cbranch_scc0 .LBB17_3 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: ; implicit-def: $vgpr2 -; GFX908-NEXT: .LBB17_5: ; %Flow -; GFX908-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; GFX908-NEXT: s_cbranch_execz .LBB17_7 -; GFX908-NEXT: ; %bb.6: ; %atomicrmw.private +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: .LBB17_3: ; %Flow +; GFX908-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; GFX908-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX908-NEXT: s_cmov_b64 exec, s[6:7] +; GFX908-NEXT: s_cbranch_scc0 .LBB17_5 +; GFX908-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX908-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX908-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX908-NEXT: .LBB17_7: ; %Flow1 -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: .LBB17_5: ; %Flow1 ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: ; implicit-def: $vgpr2 -; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_cbranch_execz .LBB17_2 -; GFX908-NEXT: .LBB17_8: ; %atomicrmw.shared +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: .LBB17_6: ; %Flow2 +; GFX908-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX908-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX908-NEXT: s_cmov_b64 exec, s[4:5] +; GFX908-NEXT: s_cbranch_scc0 .LBB17_8 +; GFX908-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX908-NEXT: ds_add_f32 v0, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: 
s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: .LBB17_8: ; %atomicrmw.phi ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -3505,11 +3584,11 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB17_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -3528,11 +3607,11 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr %ptr, i64 -512 %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -3592,10 +3671,10 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB18_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 
exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -3605,25 +3684,20 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX90A-NEXT: ; implicit-def: $vgpr0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB18_3 -; GFX90A-NEXT: ; %bb.1: ; %Flow2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB18_8 -; GFX90A-NEXT: .LBB18_2: ; %atomicrmw.phi -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.check.private +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB18_6 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v5 +; GFX90A-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX90A-NEXT: ; implicit-def: $vgpr0 -; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB18_5 -; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB18_3 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_add_f32 v0, v[4:5], v2, off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -3631,28 +3705,36 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; 
GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: .LBB18_5: ; %Flow -; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB18_7 -; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB18_3: ; %Flow +; GFX90A-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[6:7] +; GFX90A-NEXT: s_cbranch_scc0 .LBB18_5 +; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v1, -1, v4, vcc ; GFX90A-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v2, v0, v2 ; GFX90A-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GFX90A-NEXT: .LBB18_7: ; %Flow1 -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: .LBB18_5: ; %Flow1 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB18_2 -; GFX90A-NEXT: .LBB18_8: ; %atomicrmw.shared +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: .LBB18_6: ; %Flow2 +; GFX90A-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[4:5] +; GFX90A-NEXT: s_cbranch_scc0 .LBB18_8 +; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc ; GFX90A-NEXT: ds_add_rtn_f32 v0, v0, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB18_8: ; %atomicrmw.end ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3671,10 +3753,10 @@ define float 
@flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -3695,10 +3777,10 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB18_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -3718,10 +3800,10 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB18_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr %ptr, i64 511 %result = atomicrmw fadd ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -3782,10 +3864,10 @@ define void 
@flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB19_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -3795,23 +3877,18 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB19_3 -; GFX90A-NEXT: ; %bb.1: ; %Flow2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB19_8 -; GFX90A-NEXT: .LBB19_2: ; %atomicrmw.phi -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; GFX90A-NEXT: .LBB19_3: ; %atomicrmw.check.private +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB19_6 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 -; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB19_5 -; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX90A-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB19_3 +; 
GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -3819,28 +3896,36 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: .LBB19_5: ; %Flow -; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB19_7 -; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB19_3: ; %Flow +; GFX90A-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[6:7] +; GFX90A-NEXT: s_cbranch_scc0 .LBB19_5 +; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX90A-NEXT: .LBB19_7: ; %Flow1 -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: .LBB19_5: ; %Flow1 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB19_2 -; GFX90A-NEXT: .LBB19_8: ; %atomicrmw.shared +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: .LBB19_6: ; %Flow2 +; GFX90A-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[4:5] +; GFX90A-NEXT: s_cbranch_scc0 .LBB19_8 +; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX90A-NEXT: ds_add_f32 v0, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 
+; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB19_8: ; %atomicrmw.phi ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3851,50 +3936,53 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB19_3 -; GFX908-NEXT: ; %bb.1: ; %Flow2 -; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB19_8 -; GFX908-NEXT: .LBB19_2: ; %atomicrmw.phi -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: s_setpc_b64 s[30:31] -; GFX908-NEXT: .LBB19_3: ; %atomicrmw.check.private +; GFX908-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX908-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX908-NEXT: s_cmov_b64 exec, vcc +; GFX908-NEXT: s_cbranch_scc0 .LBB19_6 +; GFX908-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX908-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 -; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX908-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX908-NEXT: s_cbranch_execz .LBB19_5 -; GFX908-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX908-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX908-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX908-NEXT: s_cmov_b64 exec, vcc +; GFX908-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: ; implicit-def: $vgpr2 -; GFX908-NEXT: .LBB19_5: ; %Flow -; GFX908-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; GFX908-NEXT: s_cbranch_execz .LBB19_7 -; GFX908-NEXT: ; %bb.6: ; %atomicrmw.private +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; 
GFX908-NEXT: .LBB19_3: ; %Flow +; GFX908-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; GFX908-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX908-NEXT: s_cmov_b64 exec, s[6:7] +; GFX908-NEXT: s_cbranch_scc0 .LBB19_5 +; GFX908-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX908-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX908-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX908-NEXT: .LBB19_7: ; %Flow1 -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: .LBB19_5: ; %Flow1 ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: ; implicit-def: $vgpr2 -; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_cbranch_execz .LBB19_2 -; GFX908-NEXT: .LBB19_8: ; %atomicrmw.shared +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: .LBB19_6: ; %Flow2 +; GFX908-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX908-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX908-NEXT: s_cmov_b64 exec, s[4:5] +; GFX908-NEXT: s_cbranch_scc0 .LBB19_8 +; GFX908-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX908-NEXT: ds_add_f32 v0, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: .LBB19_8: ; %atomicrmw.phi ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -3914,11 +4002,11 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB19_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; 
GFX8-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -3937,11 +4025,11 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB19_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr %ptr, i64 511 %unused = atomicrmw fadd ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -4001,10 +4089,10 @@ define float @flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memor ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB20_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -4014,25 +4102,20 @@ define float @flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memor ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX90A-NEXT: ; 
implicit-def: $vgpr0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB20_3 -; GFX90A-NEXT: ; %bb.1: ; %Flow2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB20_8 -; GFX90A-NEXT: .LBB20_2: ; %atomicrmw.phi -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; GFX90A-NEXT: .LBB20_3: ; %atomicrmw.check.private +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB20_6 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v5 +; GFX90A-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX90A-NEXT: ; implicit-def: $vgpr0 -; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB20_5 -; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB20_3 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_add_f32 v0, v[4:5], v2, off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -4040,28 +4123,36 @@ define float @flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memor ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: .LBB20_5: ; %Flow -; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB20_7 -; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB20_3: ; %Flow +; GFX90A-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[6:7] +; GFX90A-NEXT: s_cbranch_scc0 .LBB20_5 +; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; 
GFX90A-NEXT: v_cndmask_b32_e32 v1, -1, v4, vcc ; GFX90A-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v2, v0, v2 ; GFX90A-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GFX90A-NEXT: .LBB20_7: ; %Flow1 -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: .LBB20_5: ; %Flow1 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB20_2 -; GFX90A-NEXT: .LBB20_8: ; %atomicrmw.shared +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: .LBB20_6: ; %Flow2 +; GFX90A-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[4:5] +; GFX90A-NEXT: s_cbranch_scc0 .LBB20_8 +; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc ; GFX90A-NEXT: ds_add_rtn_f32 v0, v0, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB20_8: ; %atomicrmw.end ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4080,10 +4171,10 @@ define float @flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memor ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB20_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -4104,10 +4195,10 @@ define float @flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memor ; 
GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB20_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -4127,10 +4218,10 @@ define float @flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memor ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB20_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr %ptr, i64 511 %result = atomicrmw fadd ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 @@ -4191,10 +4282,10 @@ define void @flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB21_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -4204,23 +4295,18 @@ define void 
@flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB21_3 -; GFX90A-NEXT: ; %bb.1: ; %Flow2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB21_8 -; GFX90A-NEXT: .LBB21_2: ; %atomicrmw.phi -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; GFX90A-NEXT: .LBB21_3: ; %atomicrmw.check.private +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB21_6 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 -; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB21_5 -; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX90A-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -4228,28 +4314,36 @@ define void @flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: .LBB21_5: ; %Flow -; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB21_7 -; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB21_3: ; %Flow +; GFX90A-NEXT: s_xor_b64 s[8:9], s[6:7], 
exec +; GFX90A-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[6:7] +; GFX90A-NEXT: s_cbranch_scc0 .LBB21_5 +; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX90A-NEXT: .LBB21_7: ; %Flow1 -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: .LBB21_5: ; %Flow1 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB21_2 -; GFX90A-NEXT: .LBB21_8: ; %atomicrmw.shared +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: .LBB21_6: ; %Flow2 +; GFX90A-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[4:5] +; GFX90A-NEXT: s_cbranch_scc0 .LBB21_8 +; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX90A-NEXT: ds_add_f32 v0, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB21_8: ; %atomicrmw.phi ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4260,50 +4354,53 @@ define void @flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memo ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB21_3 -; GFX908-NEXT: ; %bb.1: ; %Flow2 -; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_cbranch_execnz 
.LBB21_8 -; GFX908-NEXT: .LBB21_2: ; %atomicrmw.phi -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: s_setpc_b64 s[30:31] -; GFX908-NEXT: .LBB21_3: ; %atomicrmw.check.private +; GFX908-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX908-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX908-NEXT: s_cmov_b64 exec, vcc +; GFX908-NEXT: s_cbranch_scc0 .LBB21_6 +; GFX908-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX908-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 -; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX908-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX908-NEXT: s_cbranch_execz .LBB21_5 -; GFX908-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX908-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX908-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX908-NEXT: s_cmov_b64 exec, vcc +; GFX908-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: ; implicit-def: $vgpr2 -; GFX908-NEXT: .LBB21_5: ; %Flow -; GFX908-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; GFX908-NEXT: s_cbranch_execz .LBB21_7 -; GFX908-NEXT: ; %bb.6: ; %atomicrmw.private +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: .LBB21_3: ; %Flow +; GFX908-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; GFX908-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX908-NEXT: s_cmov_b64 exec, s[6:7] +; GFX908-NEXT: s_cbranch_scc0 .LBB21_5 +; GFX908-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX908-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX908-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX908-NEXT: .LBB21_7: ; %Flow1 -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: 
.LBB21_5: ; %Flow1 ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: ; implicit-def: $vgpr2 -; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_cbranch_execz .LBB21_2 -; GFX908-NEXT: .LBB21_8: ; %atomicrmw.shared +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: .LBB21_6: ; %Flow2 +; GFX908-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX908-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX908-NEXT: s_cmov_b64 exec, s[4:5] +; GFX908-NEXT: s_cbranch_scc0 .LBB21_8 +; GFX908-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX908-NEXT: ds_add_f32 v0, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: .LBB21_8: ; %atomicrmw.phi ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -4323,11 +4420,11 @@ define void @flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memo ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB21_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -4346,11 +4443,11 @@ define void @flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memo ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB21_1 +; GFX7-NEXT: 
s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr %ptr, i64 511 %unused = atomicrmw fadd ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 @@ -4399,11 +4496,11 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ig ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB22_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4424,10 +4521,10 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ig ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB22_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4446,10 +4543,10 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ig ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 +; 
GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4468,10 +4565,10 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ig ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB22_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -4490,10 +4587,10 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ig ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB22_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -4512,10 +4609,10 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ig ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB22_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, 
exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 @@ -4563,11 +4660,11 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_i ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB23_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: @@ -4587,10 +4684,10 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_i ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB23_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: @@ -4607,11 +4704,11 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_i ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: @@ -4628,11 +4725,11 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_i ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB23_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: @@ -4649,11 +4746,11 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_i ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB23_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: @@ -4670,11 +4767,11 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_i ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: 
s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB23_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret void @@ -4722,11 +4819,11 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB24_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4747,10 +4844,10 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB24_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4769,10 +4866,10 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-NEXT: buffer_wbinvl1 
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4791,10 +4888,10 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB24_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -4813,10 +4910,10 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB24_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -4835,10 +4932,10 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB24_1 +; GFX7-NEXT: s_andn2_b64 
s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 @@ -4886,11 +4983,11 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB25_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: @@ -4910,10 +5007,10 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB25_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: @@ -4930,11 +5027,11 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 
+; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB25_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: @@ -4951,11 +5048,11 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB25_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: @@ -4972,11 +5069,11 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB25_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: @@ -4993,11 +5090,11 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: 
s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB25_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret void @@ -5054,10 +5151,10 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdg ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB26_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -5066,26 +5163,31 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdg ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX90A-NEXT: ; implicit-def: $vgpr3 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB26_6 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB26_6 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 +; GFX90A-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX90A-NEXT: ; implicit-def: $vgpr3 -; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc -; 
GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB26_3 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB26_3 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX90A-NEXT: global_atomic_add_f32 v3, v[0:1], v2, off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2 +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: .LBB26_3: ; %Flow -; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB26_5 +; GFX90A-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[6:7] +; GFX90A-NEXT: s_cbranch_scc0 .LBB26_5 ; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc @@ -5093,20 +5195,23 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdg ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v1, v3, v2 ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: .LBB26_5: ; %Flow1 -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: .LBB26_6: ; %Flow2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB26_8 +; GFX90A-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[4:5] +; GFX90A-NEXT: s_cbranch_scc0 .LBB26_8 ; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX90A-NEXT: ds_add_rtn_f32 v3, v0, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB26_8: ; %atomicrmw.phi -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 
exec, exec, s[6:7] +; GFX90A-NEXT: .LBB26_8: ; %atomicrmw.end ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -5126,10 +5231,10 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdg ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB26_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -5148,10 +5253,10 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdg ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB26_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -5170,10 +5275,10 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdg ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB26_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr %ptr, float %val 
syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 @@ -5232,10 +5337,10 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amd ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB27_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: @@ -5243,50 +5348,53 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amd ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB27_3 -; GFX90A-NEXT: ; %bb.1: ; %Flow2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB27_8 -; GFX90A-NEXT: .LBB27_2: ; %atomicrmw.phi -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; GFX90A-NEXT: .LBB27_3: ; %atomicrmw.check.private +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB27_6 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 -; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB27_5 -; 
GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX90A-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB27_3 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: .LBB27_5: ; %Flow -; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB27_7 -; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB27_3: ; %Flow +; GFX90A-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[6:7] +; GFX90A-NEXT: s_cbranch_scc0 .LBB27_5 +; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX90A-NEXT: .LBB27_7: ; %Flow1 -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: .LBB27_5: ; %Flow1 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB27_2 -; GFX90A-NEXT: .LBB27_8: ; %atomicrmw.shared +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: .LBB27_6: ; %Flow2 +; GFX90A-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[4:5] +; GFX90A-NEXT: s_cbranch_scc0 .LBB27_8 +; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX90A-NEXT: ds_add_f32 v0, v2 ; 
GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB27_8: ; %atomicrmw.phi ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5295,50 +5403,53 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amd ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB27_3 -; GFX908-NEXT: ; %bb.1: ; %Flow2 -; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB27_8 -; GFX908-NEXT: .LBB27_2: ; %atomicrmw.phi -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: s_setpc_b64 s[30:31] -; GFX908-NEXT: .LBB27_3: ; %atomicrmw.check.private +; GFX908-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX908-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX908-NEXT: s_cmov_b64 exec, vcc +; GFX908-NEXT: s_cbranch_scc0 .LBB27_6 +; GFX908-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX908-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 -; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX908-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX908-NEXT: s_cbranch_execz .LBB27_5 -; GFX908-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX908-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX908-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX908-NEXT: s_cmov_b64 exec, vcc +; GFX908-NEXT: s_cbranch_scc0 .LBB27_3 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: ; implicit-def: $vgpr2 -; GFX908-NEXT: .LBB27_5: ; %Flow -; GFX908-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; GFX908-NEXT: s_cbranch_execz .LBB27_7 -; GFX908-NEXT: ; 
%bb.6: ; %atomicrmw.private +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: .LBB27_3: ; %Flow +; GFX908-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; GFX908-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX908-NEXT: s_cmov_b64 exec, s[6:7] +; GFX908-NEXT: s_cbranch_scc0 .LBB27_5 +; GFX908-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX908-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX908-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX908-NEXT: .LBB27_7: ; %Flow1 -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: .LBB27_5: ; %Flow1 ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: ; implicit-def: $vgpr2 -; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_cbranch_execz .LBB27_2 -; GFX908-NEXT: .LBB27_8: ; %atomicrmw.shared +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: .LBB27_6: ; %Flow2 +; GFX908-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX908-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX908-NEXT: s_cmov_b64 exec, s[4:5] +; GFX908-NEXT: s_cbranch_scc0 .LBB27_8 +; GFX908-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX908-NEXT: ds_add_f32 v0, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: .LBB27_8: ; %atomicrmw.phi ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -5356,11 +5467,11 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amd ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: 
s_cbranch_execnz .LBB27_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: @@ -5377,11 +5488,11 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amd ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB27_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret void @@ -5438,10 +5549,10 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdg ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB28_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -5460,10 +5571,10 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdg ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; 
GFX90A-NEXT: s_cbranch_execnz .LBB28_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5482,10 +5593,10 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdg ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB28_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -5504,10 +5615,10 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdg ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB28_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -5526,10 +5637,10 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdg ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB28_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX7-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 @@ -5588,10 +5699,10 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amd ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB29_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory: @@ -5608,11 +5719,11 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amd ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory: @@ -5629,11 +5740,11 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amd ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 
-; GFX908-NEXT: s_cbranch_execnz .LBB29_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory: @@ -5650,11 +5761,11 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amd ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB29_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory: @@ -5671,11 +5782,11 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amd ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB29_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret void @@ -5708,11 +5819,11 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: 
s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB30_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -5743,11 +5854,11 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB30_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -5769,10 +5880,10 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB30_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -5801,10 +5912,10 @@ 
define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB30_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -5828,10 +5939,10 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB30_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -5855,10 +5966,10 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB30_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -5889,11 +6000,11 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; 
GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB31_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -5924,11 +6035,11 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB31_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -5952,10 +6063,10 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB31_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 
s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5982,10 +6093,10 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB31_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -6011,10 +6122,10 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB31_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6038,10 +6149,10 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB31_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, 
ptr %ptr, i64 255 %result = atomicrmw fadd ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -6071,11 +6182,11 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB32_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -6115,11 +6226,11 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB32_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6142,10 +6253,10 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; 
GFX10-NEXT: s_cbranch_execnz .LBB32_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6178,10 +6289,10 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB32_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6205,10 +6316,10 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB32_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6232,10 +6343,10 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: 
s_cbranch_execnz .LBB32_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %ptr, i64 -256 %result = atomicrmw fadd ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -6264,11 +6375,11 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB33_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: @@ -6297,11 +6408,11 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB33_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX10-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: @@ -6322,10 +6433,10 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB33_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: @@ -6351,11 +6462,11 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB33_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: @@ -6376,11 +6487,11 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB33_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: @@ -6401,11 +6512,11 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB33_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fadd ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -6433,11 +6544,11 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB34_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6466,11 +6577,11 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 
exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB34_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6493,10 +6604,10 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB34_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6522,11 +6633,11 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB34_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6549,11 +6660,11 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: v_cmp_eq_u64_e32 
vcc, v[0:1], v[6:7] ; GFX8-NEXT: v_mov_b32_e32 v7, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB34_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6576,11 +6687,11 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX7-NEXT: v_mov_b32_e32 v7, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB34_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %ptr, i64 255 %unused = atomicrmw fadd ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -6609,11 +6720,11 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB35_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 
exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6649,11 +6760,11 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB35_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6676,10 +6787,10 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB35_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6711,11 +6822,11 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX908-NEXT: v_mov_b32_e32 v7, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: s_andn2_b64 exec, exec, 
s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB35_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6738,11 +6849,11 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX8-NEXT: v_mov_b32_e32 v7, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB35_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6765,11 +6876,11 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX7-NEXT: v_mov_b32_e32 v7, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB35_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %ptr, i64 -256 %unused = atomicrmw fadd ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -6817,11 +6928,11 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB36_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -6830,32 +6941,32 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v0 ; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX940-NEXT: flat_load_dword v4, v[0:1] +; GFX940-NEXT: flat_load_dword v5, v[0:1] ; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; 
GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB36_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: @@ -6890,11 +7001,11 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB36_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -6924,10 +7035,10 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB36_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ 
-6936,31 +7047,31 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v4, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB36_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: 
flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: @@ -6968,31 +7079,31 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v0 ; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: flat_load_dword v5, v[0:1] ; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v4 -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX908-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX908-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc +; GFX908-NEXT: v_mov_b32_e32 v6, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB36_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; 
GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: @@ -7021,10 +7132,10 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB36_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -7057,10 +7168,10 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB36_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -7106,11 +7217,11 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB37_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 
exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -7118,35 +7229,35 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: flat_load_dword v4, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: flat_load_dword v5, v[0:1] +; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 ; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, 
s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB37_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7182,11 +7293,11 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB37_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -7217,10 +7328,10 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB37_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -7230,31 +7341,31 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; 
GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v4, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB37_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7263,31 +7374,31 @@ define half 
@flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: flat_load_dword v5, v[0:1] ; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v4 -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX908-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX908-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc +; GFX908-NEXT: v_mov_b32_e32 v6, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB37_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: 
flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7317,10 +7428,10 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB37_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -7354,10 +7465,10 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB37_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -7404,11 +7515,11 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB38_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: 
v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -7417,35 +7528,35 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_movk_i32 s0, 0xf800 ; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: flat_load_dword v4, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: flat_load_dword v5, v[0:1] +; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 ; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: 
s_cbranch_execnz .LBB38_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -7481,11 +7592,11 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB38_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -7516,10 +7627,10 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB38_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -7529,31 +7640,31 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX90A-NEXT: 
v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v4, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB38_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -7562,31 +7673,31 @@ define half 
@flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: flat_load_dword v5, v[0:1] ; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v4 -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX908-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX908-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc +; GFX908-NEXT: v_mov_b32_e32 v6, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB38_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: 
flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -7616,10 +7727,10 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB38_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -7653,10 +7764,10 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB38_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -7701,11 +7812,11 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB39_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 
s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: @@ -7733,11 +7844,11 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB39_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: @@ -7771,11 +7882,11 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB39_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: @@ -7804,10 +7915,10 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB39_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: 
s_cbranch_scc1 .LBB39_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: @@ -7834,11 +7945,11 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB39_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: @@ -7865,11 +7976,11 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB39_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: @@ -7897,11 +8008,11 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB39_1 +; 
GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: @@ -7932,11 +8043,11 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB39_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fadd ptr %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -7979,11 +8090,11 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB40_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8013,11 +8124,11 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, 
s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB40_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8052,11 +8163,11 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB40_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8086,10 +8197,10 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB40_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8117,11 +8228,11 @@ define void 
@flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB40_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8149,11 +8260,11 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB40_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8182,11 +8293,11 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB40_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: 
flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8218,11 +8329,11 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB40_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr %ptr, i64 1023 %unused = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -8266,11 +8377,11 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB41_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -8301,11 +8412,11 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: 
s_cbranch_execnz .LBB41_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -8340,11 +8451,11 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB41_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -8374,10 +8485,10 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB41_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -8405,11 +8516,11 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 
s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB41_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -8437,11 +8548,11 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB41_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -8470,11 +8581,11 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB41_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -8506,11 +8617,11 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; 
GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB41_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr %ptr, i64 -1024 %unused = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -8542,11 +8653,11 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB42_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -8566,11 +8677,11 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB42_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 
exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -8593,11 +8704,11 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB42_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -8621,10 +8732,10 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB42_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -8643,11 +8754,11 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; 
GFX90A-NEXT: s_cbranch_execnz .LBB42_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -8666,11 +8777,11 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB42_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -8691,11 +8802,11 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB42_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -8720,11 +8831,11 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: 
s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB42_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr %ptr, i64 1023 %unused = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 @@ -8757,11 +8868,11 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB43_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -8783,10 +8894,10 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB43_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -8811,11 +8922,11 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX11-NEXT: 
buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB43_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -8840,10 +8951,10 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB43_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -8863,10 +8974,10 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB43_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8887,10 +8998,10 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; 
GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB43_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -8913,10 +9024,10 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB43_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -8942,10 +9053,10 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB43_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr %ptr, i64 1023 @@ -8991,11 +9102,11 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: 
s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB44_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -9003,35 +9114,35 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: flat_load_dword v4, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: flat_load_dword v5, v[0:1] +; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 ; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, 
v5 +; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB44_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9067,11 +9178,11 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB44_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -9102,10 +9213,10 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz 
.LBB44_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -9115,33 +9226,33 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v4, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc +; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 +; 
GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9150,31 +9261,31 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: flat_load_dword v5, v[0:1] ; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v4 -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX908-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX908-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc +; GFX908-NEXT: v_mov_b32_e32 v6, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; 
GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB44_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9204,10 +9315,10 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB44_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -9241,10 +9352,10 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB44_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -9290,11 +9401,11 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB45_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9324,11 +9435,11 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB45_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9363,11 +9474,11 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB45_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: 
flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9397,10 +9508,10 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB45_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9430,11 +9541,11 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB45_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9462,11 +9573,11 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB45_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: 
s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9495,11 +9606,11 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB45_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9531,11 +9642,11 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB45_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr %ptr, i64 1023 %unused = atomicrmw fadd ptr %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -9591,11 +9702,11 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB46_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) 
| instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -9634,10 +9745,10 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB46_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -9682,11 +9793,11 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB46_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -9722,10 +9833,10 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB46_1 +; 
GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -9761,10 +9872,10 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB46_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9800,10 +9911,10 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB46_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -9840,10 +9951,10 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB46_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX8-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -9876,10 +9987,10 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB46_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -9934,11 +10045,11 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB47_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -9979,10 +10090,10 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB47_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX940-NEXT: ; 
%bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -10029,11 +10140,11 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB47_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -10070,10 +10181,10 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB47_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -10110,10 +10221,10 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10150,10 +10261,10 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB47_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -10191,10 +10302,10 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB47_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -10228,10 +10339,10 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB47_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -10287,11 
+10398,11 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB48_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -10333,10 +10444,10 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB48_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -10383,11 +10494,11 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB48_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -10424,10 +10535,10 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB48_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -10464,10 +10575,10 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10504,10 +10615,10 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB48_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -10545,10 +10656,10 @@ define 
bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB48_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -10582,10 +10693,10 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB48_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -10640,11 +10751,11 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB49_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10683,11 +10794,11 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB49_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10732,11 +10843,11 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB49_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10772,10 +10883,10 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB49_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 
+; GFX10-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10810,11 +10921,11 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10849,11 +10960,11 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB49_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10889,11 +11000,11 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: 
s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB49_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10925,11 +11036,11 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB49_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr %ptr, i64 1023 %unused = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -10982,11 +11093,11 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB50_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11026,11 +11137,11 @@ define void 
@flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB50_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11075,11 +11186,11 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB50_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11115,10 +11226,10 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB50_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, 
s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11153,11 +11264,11 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11192,11 +11303,11 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB50_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11232,11 +11343,11 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB50_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; 
GFX8-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11268,11 +11379,11 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB50_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr %ptr, i64 -1024 %unused = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -11314,11 +11425,11 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB51_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -11350,10 +11461,10 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 
exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB51_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -11388,11 +11499,11 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB51_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -11424,10 +11535,10 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB51_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -11456,10 +11567,10 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: 
s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11489,10 +11600,10 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB51_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -11523,10 +11634,10 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB51_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -11552,10 +11663,10 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB51_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; 
GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr %ptr, i64 1023 @@ -11597,11 +11708,11 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB52_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -11631,11 +11742,11 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB52_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -11668,11 +11779,11 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: 
v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB52_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -11703,10 +11814,10 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB52_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -11734,11 +11845,11 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: 
flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -11766,11 +11877,11 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB52_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -11799,11 +11910,11 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB52_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -11828,11 +11939,11 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB52_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX7-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr %ptr, i64 1023 %unused = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 @@ -11883,11 +11994,11 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB53_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: @@ -11924,11 +12035,11 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB53_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: @@ -11971,11 +12082,11 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB53_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: @@ -12010,10 +12121,10 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB53_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: @@ -12047,11 +12158,11 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: @@ -12085,11 +12196,11 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: 
buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB53_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: @@ -12124,11 +12235,11 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB53_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: @@ -12159,11 +12270,11 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB53_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fadd ptr %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -12216,11 +12327,11 @@ define bfloat 
@flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB54_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -12261,10 +12372,10 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB54_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -12311,11 +12422,11 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB54_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -12352,10 +12463,10 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB54_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -12394,10 +12505,10 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -12434,10 +12545,10 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB54_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -12475,10 +12586,10 @@ define bfloat 
@flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB54_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -12512,10 +12623,10 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB54_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -12570,11 +12681,11 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB55_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12613,11 +12724,11 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB55_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12662,11 +12773,11 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB55_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12702,10 +12813,10 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB55_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, 
s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12742,11 +12853,11 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12781,11 +12892,11 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB55_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12821,11 +12932,11 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: 
s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB55_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12857,11 +12968,11 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB55_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr %ptr, i64 1023 %unused = atomicrmw fadd ptr %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -12914,11 +13025,11 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB56_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -12939,10 +13050,10 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB56_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -12961,10 +13072,10 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -12983,10 +13094,10 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB56_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -13007,10 +13118,10 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB56_1 +; GFX8-NEXT: s_andn2_b64 
s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -13034,25 +13145,25 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v2, v3 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[6:7] glc +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB56_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -13102,11 +13213,11 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, 
vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB57_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -13129,10 +13240,10 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB57_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13150,10 +13261,10 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -13172,10 +13283,10 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: 
s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB57_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -13198,10 +13309,10 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB57_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13226,25 +13337,25 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: flat_atomic_cmpswap v6, v[4:5], v[6:7] glc +; GFX7-NEXT: v_or_b32_e32 v0, v7, v0 +; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; 
GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB57_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -13302,11 +13413,11 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB58_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13328,10 +13439,10 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB58_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 
exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13353,10 +13464,10 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB58_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13378,10 +13489,10 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB58_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13403,10 +13514,10 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB58_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 
exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13431,25 +13542,25 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: flat_atomic_cmpswap v6, v[4:5], v[6:7] glc +; GFX7-NEXT: v_or_b32_e32 v0, v7, v0 +; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB58_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr %ptr, i64 -512 %result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -13497,11 +13608,11 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB59_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -13521,10 +13632,10 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB59_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -13541,11 +13652,11 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB59_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -13562,11 +13673,11 @@ define void 
@flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB59_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -13585,11 +13696,11 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB59_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -13612,25 +13723,25 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 
+; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB59_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -13677,11 +13788,11 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB60_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13703,10 +13814,10 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: 
s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB60_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13723,11 +13834,11 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB60_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13744,11 +13855,11 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB60_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13769,11 +13880,11 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: 
v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB60_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13798,25 +13909,25 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB60_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, 
s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr %ptr, i64 511 %unused = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -13871,11 +13982,11 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB61_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13897,10 +14008,10 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB61_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13921,11 +14032,11 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: 
s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB61_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13946,11 +14057,11 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB61_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13971,11 +14082,11 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB61_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -14000,25 +14111,25 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 
16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB61_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr %ptr, i64 -512 %unused = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -14067,11 +14178,11 @@ define <2 x half> @flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB62_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; 
GFX11-NEXT: s_cbranch_scc1 .LBB62_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -14094,10 +14205,10 @@ define <2 x half> @flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB62_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB62_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14117,10 +14228,10 @@ define <2 x half> @flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB62_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB62_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -14139,10 +14250,10 @@ define <2 x half> @flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB62_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB62_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 
; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -14165,10 +14276,10 @@ define <2 x half> @flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB62_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB62_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14193,25 +14304,25 @@ define <2 x half> @flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: flat_atomic_cmpswap v6, v[4:5], v[6:7] glc +; GFX7-NEXT: v_or_b32_e32 v0, v7, v0 +; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB62_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: 
s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB62_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fadd ptr %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -14259,11 +14370,11 @@ define void @flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB63_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB63_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14285,10 +14396,10 @@ define void @flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB63_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB63_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14307,11 +14418,11 @@ define void @flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: 
s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB63_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB63_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14328,11 +14439,11 @@ define void @flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB63_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB63_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14353,11 +14464,11 @@ define void @flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB63_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB63_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14382,25 +14493,25 @@ define void 
@flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB63_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB63_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr %ptr, i64 511 %unused = atomicrmw fadd ptr %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -14449,11 +14560,11 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(ptr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB64_1 +; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB64_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -14474,10 +14585,10 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(ptr ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB64_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB64_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -14496,10 +14607,10 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(ptr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB64_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB64_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -14518,10 +14629,10 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(ptr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB64_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: 
s_cbranch_scc1 .LBB64_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -14542,10 +14653,10 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(ptr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB64_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB64_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -14569,25 +14680,25 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(ptr ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v2, v3 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[6:7] glc +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB64_1 +; GFX7-NEXT: 
s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB64_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -14636,11 +14747,11 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr %pt ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB65_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB65_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: @@ -14660,10 +14771,10 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr %pt ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB65_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB65_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: @@ -14680,11 +14791,11 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr %pt ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, 
s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB65_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB65_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: @@ -14701,11 +14812,11 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr %pt ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB65_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB65_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: @@ -14724,11 +14835,11 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr %pt ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB65_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB65_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: @@ -14751,25 +14862,25 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr %pt ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 
+; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB65_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB65_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret void @@ -14817,11 +14928,11 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB66_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB66_1 ; GFX11-NEXT: ; %bb.2: 
; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -14842,10 +14953,10 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB66_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB66_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -14864,10 +14975,10 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB66_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB66_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -14886,10 +14997,10 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB66_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB66_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -14910,10 +15021,10 @@ define <2 x half> 
@flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB66_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB66_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -14937,25 +15048,25 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v2, v3 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[6:7] glc +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB66_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB66_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 
v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -15004,11 +15115,11 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__ ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB67_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB67_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -15028,10 +15139,10 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB67_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB67_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -15048,11 +15159,11 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB67_1 +; GFX90A-NEXT: s_cselect_b64 exec, 
s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB67_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -15069,11 +15180,11 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB67_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB67_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -15092,11 +15203,11 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB67_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB67_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -15119,25 +15230,25 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__ ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX7-NEXT: v_add_f32_e32 v6, v6, 
v2 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB67_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB67_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret void @@ -15211,12 +15322,12 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB68_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB68_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: 
s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -15253,10 +15364,10 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB68_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB68_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -15293,10 +15404,10 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB68_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB68_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -15333,10 +15444,10 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB68_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB68_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -15374,10 +15485,10 @@ 
define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB68_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB68_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -15409,13 +15520,13 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB68_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB68_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -15487,12 +15598,12 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB69_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB69_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: 
s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -15531,10 +15642,10 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB69_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB69_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15570,10 +15681,10 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB69_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB69_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -15610,10 +15721,10 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB69_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB69_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; 
GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -15653,10 +15764,10 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB69_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB69_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15689,13 +15800,13 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB69_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB69_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 %result = atomicrmw fadd ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -15775,12 +15886,12 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB70_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB70_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -15818,10 +15929,10 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB70_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB70_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -15861,10 +15972,10 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB70_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB70_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -15904,10 +16015,10 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; 
GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB70_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB70_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -15946,10 +16057,10 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB70_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB70_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -15982,13 +16093,13 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB70_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB70_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512 %result = atomicrmw fadd ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ 
-16058,12 +16169,12 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB71_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB71_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -16099,10 +16210,10 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB71_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB71_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -16137,11 +16248,11 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB71_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB71_1 ; GFX90A-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -16176,11 +16287,11 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB71_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB71_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -16216,11 +16327,11 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB71_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB71_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -16251,13 +16362,13 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; 
GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB71_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB71_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fadd ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -16326,12 +16437,12 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB72_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB72_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16369,10 +16480,10 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB72_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB72_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16407,11 +16518,11 @@ define void 
@flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB72_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB72_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16446,11 +16557,11 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB72_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB72_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16488,11 +16599,11 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB72_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB72_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: 
flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16525,13 +16636,13 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB72_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB72_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 %unused = atomicrmw fadd ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -16608,12 +16719,12 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB73_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB73_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -16651,10 +16762,10 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; 
GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB73_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB73_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -16693,11 +16804,11 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB73_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB73_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -16736,11 +16847,11 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB73_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB73_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -16778,11 +16889,11 @@ define void 
@flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB73_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB73_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -16815,13 +16926,13 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB73_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB73_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512 %unused = atomicrmw fadd ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -16892,12 +17003,12 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB74_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 
+; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB74_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -16936,10 +17047,10 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB74_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB74_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16977,10 +17088,10 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB74_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB74_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -17017,10 +17128,10 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB74_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 
exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB74_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -17060,10 +17171,10 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB74_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB74_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -17096,13 +17207,13 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB74_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB74_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 %result = atomicrmw fadd ptr %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -17172,12 +17283,12 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB75_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB75_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -17215,10 +17326,10 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB75_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB75_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -17255,11 +17366,11 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB75_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB75_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ 
-17294,11 +17405,11 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB75_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB75_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -17336,11 +17447,11 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB75_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB75_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -17373,13 +17484,13 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB75_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: 
s_cbranch_scc1 .LBB75_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 %unused = atomicrmw fadd ptr %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -17450,12 +17561,12 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB76_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB76_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -17492,10 +17603,10 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB76_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB76_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -17532,10 +17643,10 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB76_1 +; GFX90A-NEXT: 
s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB76_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -17572,10 +17683,10 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB76_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB76_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -17613,10 +17724,10 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB76_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB76_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -17648,13 +17759,13 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: 
s_cbranch_execnz .LBB76_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB76_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -17725,12 +17836,12 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB77_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB77_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: @@ -17766,10 +17877,10 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB77_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB77_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: @@ -17804,11 +17915,11 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; 
GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB77_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB77_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: @@ -17843,11 +17954,11 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB77_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB77_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: @@ -17883,11 +17994,11 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB77_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB77_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: @@ -17918,13 +18029,13 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; 
GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB77_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB77_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fadd ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret void @@ -17994,12 +18105,12 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB78_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB78_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -18036,10 +18147,10 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB78_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB78_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: 
v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -18076,10 +18187,10 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB78_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB78_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -18116,10 +18227,10 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB78_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB78_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -18157,10 +18268,10 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB78_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB78_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -18192,13 +18303,13 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX7-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB78_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB78_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -18269,12 +18380,12 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB79_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB79_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -18310,10 +18421,10 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB79_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB79_1 ; GFX10-NEXT: 
; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -18348,11 +18459,11 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB79_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB79_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -18387,11 +18498,11 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB79_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB79_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -18427,11 +18538,11 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, 
s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB79_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB79_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -18462,13 +18573,13 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB79_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB79_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fadd ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret void diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index ea2427a3c420f9..6c4ed7b4ab6e6c 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -45,10 +45,10 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB0_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -89,10 +89,10 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -113,10 +113,10 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB0_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -137,10 +137,10 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB0_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -188,10 +188,10 @@ define float 
@flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB1_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -234,10 +234,10 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -258,10 +258,10 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB1_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -284,10 +284,10 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, 
s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB1_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -345,10 +345,10 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB2_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -396,10 +396,10 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -423,10 +423,10 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: 
s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB2_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -448,10 +448,10 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB2_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -500,11 +500,11 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB3_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: @@ -545,11 +545,11 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: 
s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: @@ -568,11 +568,11 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB3_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: @@ -591,11 +591,11 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB3_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: @@ -641,11 +641,11 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 
s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB4_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -688,11 +688,11 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -711,11 +711,11 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB4_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -736,11 +736,11 @@ define void 
@flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB4_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -795,11 +795,11 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB5_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -848,11 +848,11 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: 
flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -875,11 +875,11 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB5_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -900,11 +900,11 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB5_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -954,10 +954,10 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB6_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; 
GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -1002,10 +1002,10 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1026,10 +1026,10 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB6_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -1052,10 +1052,10 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB6_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1104,11 +1104,11 @@ define void 
@flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB7_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1153,11 +1153,11 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1176,11 +1176,11 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB7_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX8-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1201,11 +1201,11 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB7_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1255,10 +1255,10 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB8_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -1282,11 +1282,11 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -1309,10 +1309,10 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB8_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1333,10 +1333,10 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1357,10 +1357,10 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB8_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -1381,10 +1381,10 @@ define float 
@flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB8_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1405,10 +1405,10 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 @@ -1448,10 +1448,10 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB9_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -1492,10 +1492,10 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: 
v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1516,10 +1516,10 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB9_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -1540,10 +1540,10 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB9_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1595,10 +1595,10 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB10_1 +; GFX940-NEXT: s_andn2_b64 
s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -1639,10 +1639,10 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1663,10 +1663,10 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB10_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -1687,10 +1687,10 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB10_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, 
exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1738,10 +1738,10 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB11_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -1784,10 +1784,10 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1808,10 +1808,10 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB11_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -1834,10 +1834,10 @@ define float 
@flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB11_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -1895,10 +1895,10 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB12_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -1946,10 +1946,10 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -1973,10 +1973,10 @@ define float 
@flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB12_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -1998,10 +1998,10 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB12_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2050,11 +2050,11 @@ define void @flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB13_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -2095,11 
+2095,11 @@ define void @flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -2118,11 +2118,11 @@ define void @flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB13_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -2141,11 +2141,11 @@ define void @flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB13_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: 
flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -2191,11 +2191,11 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB14_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2238,11 +2238,11 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2261,11 +2261,11 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB14_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX908-NEXT: ; 
%bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2286,11 +2286,11 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB14_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2345,11 +2345,11 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB15_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2398,11 +2398,11 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 
+; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2425,11 +2425,11 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB15_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2450,11 +2450,11 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB15_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2504,10 +2504,10 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; 
GFX940-NEXT: s_cbranch_execnz .LBB16_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -2552,10 +2552,10 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2576,10 +2576,10 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB16_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -2602,10 +2602,10 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB16_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB16_1 ; 
GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2654,11 +2654,11 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB17_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2703,11 +2703,11 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2726,11 +2726,11 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; 
GFX908-NEXT: s_cbranch_execnz .LBB17_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2751,11 +2751,11 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB17_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2801,11 +2801,11 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2838,11 +2838,11 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 
v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2882,10 +2882,10 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -2911,10 +2911,10 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB18_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -2955,11 +2955,11 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: 
global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2992,11 +2992,11 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3038,10 +3038,10 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB19_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: 
v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -3069,10 +3069,10 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB19_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3114,11 +3114,11 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB20_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -3160,11 +3160,11 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB20_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; 
GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3213,10 +3213,10 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB20_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3242,10 +3242,10 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB20_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3287,11 +3287,11 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB21_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -3323,11 +3323,11 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB21_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -3366,11 +3366,11 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB21_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: 
flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -3393,11 +3393,11 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB21_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -3436,11 +3436,11 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB22_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3472,11 +3472,11 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB22_1 +; GFX11-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3517,11 +3517,11 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB22_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3546,11 +3546,11 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB22_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3592,11 +3592,11 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; 
GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB23_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3635,11 +3635,11 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB23_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3687,11 +3687,11 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB23_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: 
s_cbranch_scc1 .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3716,11 +3716,11 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB23_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3762,11 +3762,11 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB24_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -3799,11 +3799,11 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; 
GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB24_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3827,10 +3827,10 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB24_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -3852,10 +3852,10 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3878,10 +3878,10 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], 
vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB24_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -3907,10 +3907,10 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB24_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -3936,10 +3936,10 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB24_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -3972,11 +3972,11 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB25_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -4009,11 +4009,11 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB25_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4053,10 +4053,10 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB25_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -4082,10 +4082,10 @@ define double 
@flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB25_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4144,11 +4144,11 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB26_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -4180,10 +4180,10 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB26_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -4221,11 +4221,11 @@ define half 
@flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4257,10 +4257,10 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB26_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4291,10 +4291,10 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4325,10 +4325,10 @@ define half 
@flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB26_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -4360,10 +4360,10 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB26_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -4396,10 +4396,10 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB26_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -4447,11 +4447,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, 
v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB27_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -4485,10 +4485,10 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB27_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -4527,11 +4527,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4564,10 +4564,10 @@ define half 
@flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB27_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4599,10 +4599,10 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4634,10 +4634,10 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB27_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -4670,10 +4670,10 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 
s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB27_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -4707,10 +4707,10 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB27_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -4759,11 +4759,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB28_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -4798,10 +4798,10 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], 
vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB28_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -4840,11 +4840,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB28_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4877,10 +4877,10 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB28_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4912,10 +4912,10 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: 
s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4947,10 +4947,10 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB28_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -4983,10 +4983,10 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB28_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -5020,10 +5020,10 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB28_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; 
GFX7-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -5071,11 +5071,11 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB29_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: @@ -5105,11 +5105,11 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB29_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: @@ -5146,11 +5146,11 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB29_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: @@ -5181,10 +5181,10 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB29_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: @@ -5213,11 +5213,11 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: @@ -5246,11 +5246,11 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: 
v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB29_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: @@ -5280,11 +5280,11 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB29_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: @@ -5315,11 +5315,11 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB29_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fmax ptr %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -5365,11 +5365,11 @@ define void 
@flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB30_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5401,11 +5401,11 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB30_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5443,11 +5443,11 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB30_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: 
s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5479,10 +5479,10 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB30_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5512,11 +5512,11 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5546,11 +5546,11 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, 
s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB30_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5581,11 +5581,11 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB30_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5617,11 +5617,11 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB30_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr %ptr, i64 1023 %unused = atomicrmw fmax ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -5668,11 +5668,11 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB31_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5705,11 +5705,11 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB31_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5747,11 +5747,11 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB31_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX10-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5783,10 +5783,10 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB31_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5816,11 +5816,11 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5850,11 +5850,11 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB31_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5885,11 +5885,11 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB31_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5921,11 +5921,11 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB31_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr %ptr, i64 -1024 %unused = atomicrmw fmax ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -5960,11 +5960,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB32_1 +; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -5988,10 +5988,10 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB32_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -6018,11 +6018,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB32_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -6049,10 +6049,10 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz 
.LBB32_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -6074,10 +6074,10 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6100,10 +6100,10 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB32_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -6128,10 +6128,10 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB32_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: 
s_cbranch_scc1 .LBB32_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -6157,10 +6157,10 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB32_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr %ptr, i64 1023 @@ -6196,11 +6196,11 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB33_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -6222,11 +6222,11 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; 
GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB33_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -6252,11 +6252,11 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB33_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -6282,10 +6282,10 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB33_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -6306,11 +6306,11 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX90A-NEXT: 
buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -6331,11 +6331,11 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB33_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -6358,11 +6358,11 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB33_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ 
-6387,11 +6387,11 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB33_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr %ptr, i64 1023 %unused = atomicrmw fmax ptr %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 @@ -6438,11 +6438,11 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB34_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -6476,10 +6476,10 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB34_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -6518,11 +6518,11 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB34_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -6555,10 +6555,10 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB34_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -6592,10 +6592,10 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, 
s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6627,10 +6627,10 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB34_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -6663,10 +6663,10 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB34_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -6700,10 +6700,10 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB34_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -6752,11 +6752,11 @@ define void 
@flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB35_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6788,11 +6788,11 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB35_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6830,11 +6830,11 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB35_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: 
s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6866,10 +6866,10 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB35_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6901,11 +6901,11 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6935,11 +6935,11 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, 
exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB35_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6970,11 +6970,11 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB35_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7006,11 +7006,11 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB35_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr %ptr, i64 1023 %unused = atomicrmw fmax ptr %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -7066,11 +7066,11 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; 
GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB36_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -7109,10 +7109,10 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB36_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -7157,11 +7157,11 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB36_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -7197,10 +7197,10 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt ; 
GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB36_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -7236,10 +7236,10 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB36_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7275,10 +7275,10 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB36_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -7315,10 +7315,10 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; 
GFX8-NEXT: s_cbranch_execnz .LBB36_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -7352,10 +7352,10 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB36_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -7410,11 +7410,11 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB37_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -7455,10 +7455,10 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; 
GFX940-NEXT: s_cbranch_execnz .LBB37_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -7505,11 +7505,11 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB37_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -7546,10 +7546,10 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB37_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -7586,10 +7586,10 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz 
.LBB37_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7626,10 +7626,10 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB37_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -7667,10 +7667,10 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB37_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -7705,10 +7705,10 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB37_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX7-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -7764,11 +7764,11 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB38_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -7810,10 +7810,10 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB38_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -7860,11 +7860,11 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB38_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 
+; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -7901,10 +7901,10 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB38_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -7941,10 +7941,10 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB38_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7981,10 +7981,10 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB38_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX908-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -8022,10 +8022,10 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB38_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -8060,10 +8060,10 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB38_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -8116,11 +8116,11 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB39_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: @@ -8157,11 +8157,11 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB39_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: @@ -8204,11 +8204,11 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB39_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: @@ -8243,10 +8243,10 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB39_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; 
GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: @@ -8280,11 +8280,11 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB39_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: @@ -8318,11 +8318,11 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB39_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: @@ -8357,11 +8357,11 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, 
exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB39_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: @@ -8393,11 +8393,11 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB39_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fmax ptr %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -8449,11 +8449,11 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB40_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8492,11 +8492,11 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: 
v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB40_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8541,11 +8541,11 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB40_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8581,10 +8581,10 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB40_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: 
flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8619,11 +8619,11 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB40_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8658,11 +8658,11 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB40_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8698,11 +8698,11 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB40_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX8-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8735,11 +8735,11 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB40_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr %ptr, i64 1023 %unused = atomicrmw fmax ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -8792,11 +8792,11 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB41_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -8836,11 +8836,11 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, 
s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB41_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -8885,11 +8885,11 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB41_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -8925,10 +8925,10 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB41_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -8963,11 +8963,11 @@ define void 
@flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB41_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -9002,11 +9002,11 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB41_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -9042,11 +9042,11 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB41_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: 
flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -9079,11 +9079,11 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB41_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr %ptr, i64 -1024 %unused = atomicrmw fmax ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -9125,11 +9125,11 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB42_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -9161,10 +9161,10 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB42_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: 
s_cbranch_scc1 .LBB42_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -9199,11 +9199,11 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB42_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -9235,10 +9235,10 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB42_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -9267,10 +9267,10 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB42_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; 
GFX90A-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9300,10 +9300,10 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB42_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -9334,10 +9334,10 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB42_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -9364,10 +9364,10 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB42_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: 
v_lshlrev_b32_e32 v0, 16, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr %ptr, i64 1023 @@ -9409,11 +9409,11 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB43_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -9443,11 +9443,11 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB43_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -9480,11 +9480,11 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz 
.LBB43_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -9515,10 +9515,10 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB43_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -9546,11 +9546,11 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB43_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -9578,11 +9578,11 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: 
v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB43_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -9611,11 +9611,11 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB43_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -9641,11 +9641,11 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB43_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr %ptr, i64 1023 %unused = atomicrmw fmax ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4, 
!amdgpu.no.fine.grained.memory !0 @@ -9699,11 +9699,11 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB44_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -9744,10 +9744,10 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB44_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -9794,11 +9794,11 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB44_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB44_1 ; 
GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -9835,10 +9835,10 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB44_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -9877,10 +9877,10 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9917,10 +9917,10 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB44_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; 
@@ -9958,10 +9958,10 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB44_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -9996,10 +9996,10 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB44_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -10054,11 +10054,11 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB45_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10097,11 +10097,11 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB45_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10146,11 +10146,11 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB45_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10186,10 +10186,10 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB45_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, 
s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10226,11 +10226,11 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB45_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10265,11 +10265,11 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB45_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10305,11 +10305,11 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: 
s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB45_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10342,11 +10342,11 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB45_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr %ptr, i64 1023 %unused = atomicrmw fmax ptr %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -10382,11 +10382,11 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB46_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -10409,10 +10409,10 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: 
v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB46_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -10436,11 +10436,11 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB46_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -10463,10 +10463,10 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB46_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -10487,10 +10487,10 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 
s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB46_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10511,10 +10511,10 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB46_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -10539,10 +10539,10 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB46_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -10566,25 +10566,25 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v2, v3 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v4 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; 
GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[6:7] glc +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB46_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -10617,11 +10617,11 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB47_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -10644,10 +10644,10 @@ define <2 x half> 
@flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB47_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -10671,11 +10671,11 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB47_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -10700,10 +10700,10 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB47_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10723,10 +10723,10 @@ define <2 x 
half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10747,10 +10747,10 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB47_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -10777,10 +10777,10 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB47_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10805,25 +10805,25 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 ; GFX7-NEXT: 
v_cvt_f32_f16_e32 v7, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: flat_atomic_cmpswap v6, v[4:5], v[6:7] glc +; GFX7-NEXT: v_or_b32_e32 v0, v7, v0 +; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB47_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fmax ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -10855,11 +10855,11 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB48_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: 
s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -10890,10 +10890,10 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB48_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10922,11 +10922,11 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB48_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10950,10 +10950,10 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz 
.LBB48_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10977,10 +10977,10 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11004,10 +11004,10 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB48_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11033,10 +11033,10 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz 
.LBB48_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11061,25 +11061,25 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: flat_atomic_cmpswap v6, v[4:5], v[6:7] glc +; GFX7-NEXT: v_or_b32_e32 v0, v7, v0 +; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB48_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr %ptr, i64 -512 %result = atomicrmw fmax ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -11111,11 
+11111,11 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB49_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -11136,11 +11136,11 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB49_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -11163,11 +11163,11 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB49_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: 
s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -11189,10 +11189,10 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB49_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -11211,11 +11211,11 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -11234,11 +11234,11 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: 
s_cbranch_execnz .LBB49_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -11261,11 +11261,11 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB49_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -11288,25 +11288,25 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB49_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fmax ptr %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -11337,11 +11337,11 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB50_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11362,11 +11362,11 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB50_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 
; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11389,11 +11389,11 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB50_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11417,10 +11417,10 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB50_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11439,11 +11439,11 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz 
.LBB50_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11462,11 +11462,11 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB50_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11491,11 +11491,11 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB50_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11520,25 +11520,25 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX7-NEXT: 
v_max_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB50_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr %ptr, i64 511 %unused = atomicrmw fmax ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -11570,11 +11570,11 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB51_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX12-NEXT: ; 
%bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11601,11 +11601,11 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB51_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11632,11 +11632,11 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB51_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11660,10 +11660,10 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; 
GFX10-NEXT: s_cbranch_execnz .LBB51_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11686,11 +11686,11 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11713,11 +11713,11 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB51_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11742,11 +11742,11 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: 
s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB51_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11771,25 +11771,25 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB51_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] 
%gep = getelementptr <2 x half>, ptr %ptr, i64 -512 %unused = atomicrmw fmax ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -11821,11 +11821,11 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB52_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -11848,10 +11848,10 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB52_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -11875,11 +11875,11 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB52_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: 
s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -11904,10 +11904,10 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB52_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11929,10 +11929,10 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11953,10 +11953,10 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB52_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB52_1 ; 
GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -11983,10 +11983,10 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB52_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12011,25 +12011,25 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: flat_atomic_cmpswap v6, v[4:5], v[6:7] glc +; GFX7-NEXT: v_or_b32_e32 v0, v7, v0 +; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: 
s_cbranch_execnz .LBB52_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fmax ptr %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -12061,11 +12061,11 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB53_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12086,11 +12086,11 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB53_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12113,11 +12113,11 @@ define void 
@flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB53_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12141,10 +12141,10 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB53_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12165,11 +12165,11 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, 
s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12188,11 +12188,11 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB53_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12217,11 +12217,11 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB53_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12246,25 +12246,25 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; 
GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB53_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr %ptr, i64 511 %unused = atomicrmw fmax ptr %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -12318,11 +12318,11 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB54_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -12361,10 +12361,10 @@ define <2 x bfloat> 
@flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB54_1 +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -12408,12 +12408,12 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB54_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -12450,10 +12450,10 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB54_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -12490,10 +12490,10 @@ define <2 x bfloat> 
@flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -12530,10 +12530,10 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB54_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -12571,10 +12571,10 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB54_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -12606,13 +12606,13 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; 
GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB54_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -12663,11 +12663,11 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB55_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -12706,10 +12706,10 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB55_1 +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -12753,12 +12753,12 @@ define <2 x bfloat> 
@flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB55_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -12797,10 +12797,10 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB55_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12836,10 +12836,10 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 
s[30:31] ; @@ -12876,10 +12876,10 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB55_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -12919,10 +12919,10 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB55_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12955,13 +12955,13 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB55_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 
s[30:31] %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 %result = atomicrmw fmax ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -13011,11 +13011,11 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB56_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -13062,10 +13062,10 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB56_1 +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13114,12 +13114,12 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB56_1 +; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13157,10 +13157,10 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB56_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13200,10 +13200,10 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13243,10 +13243,10 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[6:7], 
vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB56_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13285,10 +13285,10 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB56_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13321,13 +13321,13 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB56_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512 %result = atomicrmw fmax ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, 
!amdgpu.no.fine.grained.memory !0 @@ -13376,11 +13376,11 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB57_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -13417,11 +13417,11 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB57_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -13463,12 +13463,12 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB57_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: 
s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -13504,10 +13504,10 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB57_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -13542,11 +13542,11 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -13581,11 +13581,11 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; 
GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB57_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -13621,48 +13621,48 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB57_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[0:1] ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: 
v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 -; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc +; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB57_1 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fmax ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -13710,11 +13710,11 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB58_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13751,11 +13751,11 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB58_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13797,12 +13797,12 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB58_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13840,10 +13840,10 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 
exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB58_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13878,11 +13878,11 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB58_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13917,11 +13917,11 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB58_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13959,11 +13959,11 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 
vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB58_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13971,38 +13971,38 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[0:1] ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: 
v_alignbit_b32 v4, v4, v7, 16 -; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc +; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB58_1 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 %unused = atomicrmw fmax ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -14051,11 +14051,11 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB59_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -14098,11 +14098,11 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: 
v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB59_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -14148,12 +14148,12 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB59_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -14191,10 +14191,10 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB59_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: 
flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -14233,11 +14233,11 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB59_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -14276,11 +14276,11 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB59_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -14318,11 +14318,11 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB59_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX8-NEXT: ; %bb.2: 
; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -14330,38 +14330,38 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[0:1] ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 -; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc +; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 
vcc, v6, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB59_1 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512 %unused = atomicrmw fmax ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -14411,11 +14411,11 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB60_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -14454,10 +14454,10 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB60_1 +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: 
v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -14501,12 +14501,12 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB60_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -14545,10 +14545,10 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB60_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14586,10 +14586,10 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB60_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -14626,10 +14626,10 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB60_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -14669,10 +14669,10 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB60_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14705,13 +14705,13 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB60_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB60_1 ; 
GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 %result = atomicrmw fmax ptr %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -14760,11 +14760,11 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB61_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14801,11 +14801,11 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB61_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14847,12 +14847,12 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 
s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB61_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14890,10 +14890,10 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB61_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14930,11 +14930,11 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB61_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: 
flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14969,11 +14969,11 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB61_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15011,11 +15011,11 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB61_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15023,38 +15023,38 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[0:1] ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; GFX7-NEXT: 
v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 -; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc +; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB61_1 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 %unused = atomicrmw fmax ptr %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 diff --git 
a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index 2767b66e447030..370860776d881d 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -45,10 +45,10 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB0_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -89,10 +89,10 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -113,10 +113,10 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB0_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; 
GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -137,10 +137,10 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB0_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -188,10 +188,10 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB1_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -234,10 +234,10 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -258,10 +258,10 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX908-NEXT: 
buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB1_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -284,10 +284,10 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB1_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -345,10 +345,10 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB2_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -396,10 +396,10 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX90A-NEXT: s_or_b64 
s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -423,10 +423,10 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB2_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -448,10 +448,10 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB2_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -500,11 +500,11 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; 
GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB3_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: @@ -545,11 +545,11 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: @@ -568,11 +568,11 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB3_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: @@ -591,11 +591,11 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 
vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB3_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: @@ -641,11 +641,11 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB4_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -688,11 +688,11 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -711,11 +711,11 @@ define void 
@flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB4_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -736,11 +736,11 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB4_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -795,11 +795,11 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB5_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -848,11 +848,11 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -875,11 +875,11 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB5_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -900,11 +900,11 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB5_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -954,10 +954,10 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB6_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -1002,10 +1002,10 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1026,10 +1026,10 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB6_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; 
@@ -1052,10 +1052,10 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB6_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1104,11 +1104,11 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB7_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1153,11 +1153,11 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: 
flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1176,11 +1176,11 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB7_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1201,11 +1201,11 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB7_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1255,10 +1255,10 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB8_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 
; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -1282,11 +1282,11 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -1309,10 +1309,10 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB8_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1333,10 +1333,10 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 
s[30:31] ; @@ -1357,10 +1357,10 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB8_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -1381,10 +1381,10 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB8_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1405,10 +1405,10 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 @@ -1448,10 +1448,10 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd ; 
GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB9_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -1492,10 +1492,10 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1516,10 +1516,10 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB9_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -1540,10 +1540,10 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: 
s_cbranch_execnz .LBB9_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1595,10 +1595,10 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB10_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -1639,10 +1639,10 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1663,10 +1663,10 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB10_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX908-NEXT: ; 
%bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -1687,10 +1687,10 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB10_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1738,10 +1738,10 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB11_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -1784,10 +1784,10 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1808,10 +1808,10 @@ define float 
@flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB11_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -1834,10 +1834,10 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB11_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -1895,10 +1895,10 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB12_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -1946,10 +1946,10 @@ define float 
@flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -1973,10 +1973,10 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB12_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -1998,10 +1998,10 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB12_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2050,11 +2050,11 @@ define void 
@flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB13_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -2095,11 +2095,11 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -2118,11 +2118,11 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB13_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: 
flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -2141,11 +2141,11 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB13_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -2191,11 +2191,11 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB14_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2238,11 +2238,11 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: 
s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2261,11 +2261,11 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB14_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2286,11 +2286,11 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB14_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2345,11 +2345,11 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB15_1 +; GFX940-NEXT: s_cselect_b64 exec, 
s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2398,11 +2398,11 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2425,11 +2425,11 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB15_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2450,11 +2450,11 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB15_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2504,10 +2504,10 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB16_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -2552,10 +2552,10 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2576,10 +2576,10 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB16_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: 
s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -2602,10 +2602,10 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB16_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2654,11 +2654,11 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB17_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2703,11 +2703,11 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 
+; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2726,11 +2726,11 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB17_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2751,11 +2751,11 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB17_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2801,11 +2801,11 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2838,11 +2838,11 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2882,10 +2882,10 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -2911,10 +2911,10 @@ define double 
@flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB18_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -2955,11 +2955,11 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2992,11 +2992,11 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3038,10 +3038,10 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB19_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -3069,10 +3069,10 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB19_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3114,11 +3114,11 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB20_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 
exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -3160,11 +3160,11 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB20_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3213,10 +3213,10 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB20_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3242,10 +3242,10 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; 
GFX8-NEXT: s_cbranch_execnz .LBB20_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3287,11 +3287,11 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB21_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -3323,11 +3323,11 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB21_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: 
flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -3366,11 +3366,11 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB21_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -3393,11 +3393,11 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB21_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -3436,11 +3436,11 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB22_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: 
s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3472,11 +3472,11 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB22_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3517,11 +3517,11 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB22_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3546,11 +3546,11 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; 
GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB22_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3592,11 +3592,11 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB23_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3635,11 +3635,11 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB23_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB23_1 ; 
GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3687,11 +3687,11 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB23_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3716,11 +3716,11 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB23_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3762,11 +3762,11 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz 
.LBB24_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -3799,11 +3799,11 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB24_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3827,10 +3827,10 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB24_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -3852,10 +3852,10 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], 
v[6:7] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3878,10 +3878,10 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB24_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -3907,10 +3907,10 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB24_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -3936,10 +3936,10 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: 
s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB24_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -3972,11 +3972,11 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB25_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -4009,11 +4009,11 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB25_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4053,10 +4053,10 @@ define double 
@flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB25_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -4082,10 +4082,10 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB25_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4144,11 +4144,11 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB26_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; 
@@ -4180,10 +4180,10 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB26_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -4221,11 +4221,11 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4257,10 +4257,10 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB26_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4291,10 +4291,10 @@ define half 
@flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4325,10 +4325,10 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB26_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -4360,10 +4360,10 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB26_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -4396,10 +4396,10 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX7-NEXT: s_or_b64 
s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB26_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -4447,11 +4447,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB27_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -4485,10 +4485,10 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB27_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -4527,11 +4527,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; 
GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4564,10 +4564,10 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB27_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4599,10 +4599,10 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4634,10 +4634,10 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, 
s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB27_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -4670,10 +4670,10 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB27_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -4707,10 +4707,10 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB27_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -4759,11 +4759,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB28_1 
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -4798,10 +4798,10 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB28_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -4840,11 +4840,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB28_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4877,10 +4877,10 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 
exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB28_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4912,10 +4912,10 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4947,10 +4947,10 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB28_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -4983,10 +4983,10 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB28_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; 
GFX8-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -5020,10 +5020,10 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB28_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -5071,11 +5071,11 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB29_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: @@ -5105,11 +5105,11 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: 
s_cbranch_execnz .LBB29_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: @@ -5146,11 +5146,11 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB29_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: @@ -5181,10 +5181,10 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB29_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: @@ -5213,11 +5213,11 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: 
s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: @@ -5246,11 +5246,11 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB29_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: @@ -5280,11 +5280,11 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB29_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: @@ -5315,11 +5315,11 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: 
s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB29_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fmin ptr %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -5365,11 +5365,11 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB30_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5401,11 +5401,11 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB30_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5443,11 +5443,11 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB30_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5479,10 +5479,10 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB30_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5512,11 +5512,11 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; 
GFX90A-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5546,11 +5546,11 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB30_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5581,11 +5581,11 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB30_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5617,11 +5617,11 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: 
s_cbranch_execnz .LBB30_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr %ptr, i64 1023 %unused = atomicrmw fmin ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -5668,11 +5668,11 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB31_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5705,11 +5705,11 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB31_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5747,11 +5747,11 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; 
GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB31_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5783,10 +5783,10 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB31_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5816,11 +5816,11 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: 
flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5850,11 +5850,11 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB31_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5885,11 +5885,11 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB31_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5921,11 +5921,11 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB31_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 
exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr %ptr, i64 -1024 %unused = atomicrmw fmin ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -5960,11 +5960,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB32_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -5988,10 +5988,10 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB32_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -6018,11 +6018,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB32_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -6049,10 +6049,10 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB32_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -6074,10 +6074,10 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6100,10 +6100,10 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB32_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; 
GFX908-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -6128,10 +6128,10 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB32_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -6157,10 +6157,10 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB32_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr %ptr, i64 1023 @@ -6196,11 +6196,11 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB33_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: 
s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -6222,11 +6222,11 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB33_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -6252,11 +6252,11 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB33_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -6282,10 +6282,10 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 
v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB33_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -6306,11 +6306,11 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -6331,11 +6331,11 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB33_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -6358,11 +6358,11 @@ define void 
@flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB33_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -6387,11 +6387,11 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB33_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr %ptr, i64 1023 %unused = atomicrmw fmin ptr %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 @@ -6438,11 +6438,11 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB34_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB34_1 ; 
GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -6476,10 +6476,10 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB34_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -6518,11 +6518,11 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB34_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -6555,10 +6555,10 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB34_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX10-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -6592,10 +6592,10 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6627,10 +6627,10 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB34_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -6663,10 +6663,10 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB34_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -6700,10 +6700,10 @@ 
define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB34_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -6752,11 +6752,11 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB35_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6788,11 +6788,11 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB35_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; 
GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6830,11 +6830,11 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB35_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6866,10 +6866,10 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB35_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6901,11 +6901,11 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 +; 
GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6935,11 +6935,11 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB35_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6970,11 +6970,11 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB35_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7006,11 +7006,11 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; 
GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB35_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr %ptr, i64 1023 %unused = atomicrmw fmin ptr %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -7066,11 +7066,11 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB36_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -7109,10 +7109,10 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB36_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -7157,11 +7157,11 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, 
vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB36_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -7197,10 +7197,10 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB36_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -7236,10 +7236,10 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB36_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7275,10 +7275,10 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: 
s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB36_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -7315,10 +7315,10 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB36_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -7352,10 +7352,10 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB36_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -7410,11 +7410,11 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB37_1 +; GFX12-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -7455,10 +7455,10 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB37_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -7505,11 +7505,11 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB37_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -7546,10 +7546,10 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 
-; GFX10-NEXT: s_cbranch_execnz .LBB37_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -7586,10 +7586,10 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB37_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7626,10 +7626,10 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB37_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -7667,10 +7667,10 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB37_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 
.LBB37_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -7705,10 +7705,10 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB37_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -7764,11 +7764,11 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB38_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -7810,10 +7810,10 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB38_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: 
s_cbranch_scc1 .LBB38_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -7860,11 +7860,11 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB38_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -7901,10 +7901,10 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB38_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -7941,10 +7941,10 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB38_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB38_1 ; 
GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7981,10 +7981,10 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB38_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -8022,10 +8022,10 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB38_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -8060,10 +8060,10 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB38_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: 
s_setpc_b64 s[30:31] @@ -8116,11 +8116,11 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB39_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: @@ -8157,11 +8157,11 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB39_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: @@ -8204,11 +8204,11 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB39_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, 
s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: @@ -8243,10 +8243,10 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB39_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: @@ -8280,11 +8280,11 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB39_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: @@ -8318,11 +8318,11 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: 
s_cbranch_execnz .LBB39_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: @@ -8357,11 +8357,11 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB39_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: @@ -8393,11 +8393,11 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB39_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fmin ptr %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -8449,11 +8449,11 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 
exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB40_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8492,11 +8492,11 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB40_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8541,11 +8541,11 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB40_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: 
flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8581,10 +8581,10 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB40_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8619,11 +8619,11 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB40_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8658,11 +8658,11 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB40_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: 
s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8698,11 +8698,11 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB40_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8735,11 +8735,11 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB40_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr %ptr, i64 1023 %unused = atomicrmw fmin ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -8792,11 +8792,11 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB41_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 
| instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -8836,11 +8836,11 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB41_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -8885,11 +8885,11 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB41_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -8925,10 +8925,10 @@ define void 
@flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB41_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -8963,11 +8963,11 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB41_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -9002,11 +9002,11 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB41_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: 
flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -9042,11 +9042,11 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB41_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -9079,11 +9079,11 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB41_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr %ptr, i64 -1024 %unused = atomicrmw fmin ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -9125,11 +9125,11 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB42_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 
s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -9161,10 +9161,10 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB42_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -9199,11 +9199,11 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB42_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -9235,10 +9235,10 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB42_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 
+; GFX10-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -9267,10 +9267,10 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB42_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9300,10 +9300,10 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB42_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -9334,10 +9334,10 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB42_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, 
s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -9364,10 +9364,10 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB42_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr %ptr, i64 1023 @@ -9409,11 +9409,11 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB43_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -9443,11 +9443,11 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB43_1 +; 
GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -9480,11 +9480,11 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB43_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -9515,10 +9515,10 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB43_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -9546,11 +9546,11 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 
s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB43_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -9578,11 +9578,11 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB43_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -9611,11 +9611,11 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB43_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -9641,11 +9641,11 @@ define void 
@flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB43_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr %ptr, i64 1023 %unused = atomicrmw fmin ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 @@ -9699,11 +9699,11 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB44_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -9744,10 +9744,10 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB44_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, 
exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -9794,11 +9794,11 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB44_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -9835,10 +9835,10 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB44_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -9877,10 +9877,10 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: 
v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9917,10 +9917,10 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB44_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -9958,10 +9958,10 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB44_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -9996,10 +9996,10 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB44_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -10054,11 +10054,11 @@ define void 
@flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB45_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10097,11 +10097,11 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB45_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10146,11 +10146,11 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB45_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; 
GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10186,10 +10186,10 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB45_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10226,11 +10226,11 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB45_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10265,11 +10265,11 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: 
s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB45_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10305,11 +10305,11 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB45_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10342,11 +10342,11 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB45_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr %ptr, i64 1023 %unused = atomicrmw fmin ptr %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -10382,11 +10382,11 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; 
GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB46_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -10409,10 +10409,10 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB46_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -10436,11 +10436,11 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB46_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -10463,10 +10463,10 @@ define <2 x half> 
@flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB46_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -10487,10 +10487,10 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB46_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10511,10 +10511,10 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB46_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -10539,10 +10539,10 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; 
GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB46_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -10566,25 +10566,25 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v2, v3 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v4 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[6:7] glc +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB46_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -10617,11 +10617,11 @@ define <2 x half> 
@flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB47_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -10644,10 +10644,10 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB47_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -10671,11 +10671,11 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB47_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: 
v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -10700,10 +10700,10 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB47_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10723,10 +10723,10 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10747,10 +10747,10 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB47_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -10777,10 +10777,10 @@ define <2 x half> 
@flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB47_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10805,25 +10805,25 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: flat_atomic_cmpswap v6, v[4:5], v[6:7] glc +; GFX7-NEXT: v_or_b32_e32 v0, v7, v0 +; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB47_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fmin ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -10855,11 +10855,11 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB48_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -10890,10 +10890,10 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB48_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10922,11 +10922,11 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: 
s_cbranch_execnz .LBB48_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10950,10 +10950,10 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB48_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10977,10 +10977,10 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11004,10 +11004,10 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 
s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB48_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11033,10 +11033,10 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB48_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11061,25 +11061,25 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: flat_atomic_cmpswap v6, v[4:5], v[6:7] glc +; GFX7-NEXT: v_or_b32_e32 v0, v7, v0 +; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 +; 
GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB48_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr %ptr, i64 -512 %result = atomicrmw fmin ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -11111,11 +11111,11 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB49_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -11136,11 +11136,11 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB49_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; 
GFX940-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -11163,11 +11163,11 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB49_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -11189,10 +11189,10 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB49_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -11211,11 +11211,11 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; 
GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -11234,11 +11234,11 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB49_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -11261,11 +11261,11 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB49_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -11288,25 +11288,25 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 
v5, v4, v5 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB49_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fmin ptr %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -11337,11 +11337,11 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB50_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11362,11 +11362,11 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB50_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11389,11 +11389,11 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB50_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11417,10 +11417,10 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: 
s_cbranch_execnz .LBB50_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11439,11 +11439,11 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11462,11 +11462,11 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB50_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11491,11 +11491,11 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], 
vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB50_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11520,25 +11520,25 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB50_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = 
getelementptr <2 x half>, ptr %ptr, i64 511 %unused = atomicrmw fmin ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -11570,11 +11570,11 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB51_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11601,11 +11601,11 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB51_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11632,11 +11632,11 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 
-; GFX11-NEXT: s_cbranch_execnz .LBB51_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11660,10 +11660,10 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB51_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11686,11 +11686,11 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11713,11 +11713,11 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: buffer_wbinvl1 ; 
GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB51_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11742,11 +11742,11 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB51_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11771,25 +11771,25 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: 
flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB51_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr %ptr, i64 -512 %unused = atomicrmw fmin ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -11821,11 +11821,11 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB52_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -11848,10 +11848,10 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: 
s_cbranch_execnz .LBB52_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -11875,11 +11875,11 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB52_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -11904,10 +11904,10 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB52_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11929,10 +11929,10 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; 
GFX90A-NEXT: s_cbranch_execnz .LBB52_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11953,10 +11953,10 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB52_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -11983,10 +11983,10 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB52_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12011,25 +12011,25 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: 
v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: flat_atomic_cmpswap v6, v[4:5], v[6:7] glc +; GFX7-NEXT: v_or_b32_e32 v0, v7, v0 +; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB52_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fmin ptr %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -12061,11 +12061,11 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB53_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12086,11 +12086,11 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB53_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12113,11 +12113,11 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB53_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12141,10 +12141,10 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB53_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 
exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12165,11 +12165,11 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12188,11 +12188,11 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB53_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12217,11 +12217,11 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB53_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12246,25 +12246,25 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB53_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr %ptr, i64 511 %unused = atomicrmw fmin ptr %gep, <2 x half> %val 
seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -12318,11 +12318,11 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB54_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -12361,10 +12361,10 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB54_1 +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -12408,12 +12408,12 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB54_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB54_1 ; 
GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -12450,10 +12450,10 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB54_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -12490,10 +12490,10 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -12530,10 +12530,10 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB54_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: 
s_setpc_b64 s[30:31] ; @@ -12571,10 +12571,10 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB54_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -12606,13 +12606,13 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB54_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -12663,11 +12663,11 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB55_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 
.LBB55_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -12706,10 +12706,10 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB55_1 +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -12753,12 +12753,12 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB55_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -12797,10 +12797,10 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB55_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB55_1 
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12836,10 +12836,10 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -12876,10 +12876,10 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB55_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -12919,10 +12919,10 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB55_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; 
; GFX7-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12955,13 +12955,13 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB55_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 %result = atomicrmw fmin ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -13011,11 +13011,11 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB56_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -13062,10 +13062,10 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 
exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB56_1 +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13114,12 +13114,12 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB56_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13157,10 +13157,10 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB56_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13200,10 +13200,10 @@ define <2 x bfloat> 
@flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13243,10 +13243,10 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB56_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13285,10 +13285,10 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB56_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13321,13 +13321,13 @@ define <2 x bfloat> 
@flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB56_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512 %result = atomicrmw fmin ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -13376,11 +13376,11 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB57_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -13417,11 +13417,11 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, 
s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB57_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -13463,12 +13463,12 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB57_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -13504,10 +13504,10 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB57_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -13542,11 +13542,11 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: 
v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -13581,11 +13581,11 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB57_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -13621,48 +13621,48 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB57_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: 
flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[0:1] ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 -; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc +; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB57_1 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: 
s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fmin ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -13710,11 +13710,11 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB58_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13751,11 +13751,11 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB58_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13797,12 +13797,12 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB58_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13840,10 +13840,10 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB58_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13878,11 +13878,11 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB58_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13917,11 +13917,11 @@ define void 
@flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB58_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13959,11 +13959,11 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB58_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13971,38 +13971,38 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[0:1] ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; 
GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 -; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc +; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB58_1 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 %unused = atomicrmw fmin ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -14051,11 +14051,11 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: v_cmp_eq_u32_e32 
vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB59_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -14098,11 +14098,11 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB59_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -14148,12 +14148,12 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB59_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX11-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -14191,10 +14191,10 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB59_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -14233,11 +14233,11 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB59_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -14276,11 +14276,11 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: 
s_cbranch_execnz .LBB59_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -14318,11 +14318,11 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB59_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -14330,38 +14330,38 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[0:1] ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 
0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 -; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc +; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB59_1 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512 %unused = atomicrmw fmin ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -14411,11 +14411,11 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB60_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; 
GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -14454,10 +14454,10 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB60_1 +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -14501,12 +14501,12 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB60_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -14545,10 +14545,10 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB60_1 +; 
GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14586,10 +14586,10 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB60_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -14626,10 +14626,10 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB60_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -14669,10 +14669,10 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB60_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 
.LBB60_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14705,13 +14705,13 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB60_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 %result = atomicrmw fmin ptr %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -14760,11 +14760,11 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB61_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14801,11 +14801,11 @@ define void 
@flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB61_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14847,12 +14847,12 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB61_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14890,10 +14890,10 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB61_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX10-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14930,11 +14930,11 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB61_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14969,11 +14969,11 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB61_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15011,11 +15011,11 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz 
.LBB61_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15023,38 +15023,38 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[0:1] ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 -; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc +; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; 
GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB61_1 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 %unused = atomicrmw fmin ptr %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll index 6672f16c4a7a8d..9ed79b6c1b2925 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll @@ -35,11 +35,11 @@ define float @flat_agent_atomic_fsub_ret_f32(ptr %ptr, float %val) #0 { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB0_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -59,10 +59,10 @@ define float @flat_agent_atomic_fsub_ret_f32(ptr %ptr, float %val) #0 { ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; 
GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB0_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -84,11 +84,11 @@ define float @flat_agent_atomic_fsub_ret_f32(ptr %ptr, float %val) #0 { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB0_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -109,10 +109,10 @@ define float @flat_agent_atomic_fsub_ret_f32(ptr %ptr, float %val) #0 { ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB0_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -131,10 +131,10 @@ define float @flat_agent_atomic_fsub_ret_f32(ptr %ptr, float %val) #0 { ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 +; GFX90A-NEXT: 
s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -153,10 +153,10 @@ define float @flat_agent_atomic_fsub_ret_f32(ptr %ptr, float %val) #0 { ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB0_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -175,10 +175,10 @@ define float @flat_agent_atomic_fsub_ret_f32(ptr %ptr, float %val) #0 { ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB0_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -197,10 +197,10 @@ define float @flat_agent_atomic_fsub_ret_f32(ptr %ptr, float %val) #0 { ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB0_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; 
GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub ptr %ptr, float %val syncscope("agent") seq_cst @@ -230,11 +230,11 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB1_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -254,10 +254,10 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB1_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -279,11 +279,11 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB1_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; 
GFX11-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -306,10 +306,10 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB1_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos: @@ -327,10 +327,10 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -349,10 +349,10 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB1_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: 
s_setpc_b64 s[30:31] ; @@ -373,10 +373,10 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB1_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos: @@ -396,10 +396,10 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB1_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr %ptr, i64 511 %result = atomicrmw fsub ptr %gep, float %val syncscope("agent") seq_cst @@ -429,11 +429,11 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB2_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 
v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -461,10 +461,10 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB2_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg: @@ -491,11 +491,11 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB2_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg: @@ -517,10 +517,10 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB2_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX90A-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg: @@ -542,10 +542,10 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg: @@ -567,10 +567,10 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB2_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg: @@ -590,10 +590,10 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB2_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg: @@ -613,10 +613,10 @@ define float 
@flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB2_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr %ptr, i64 -512 %result = atomicrmw fsub ptr %gep, float %val syncscope("agent") seq_cst @@ -645,11 +645,11 @@ define void @flat_agent_atomic_fsub_noret_f32(ptr %ptr, float %val) #0 { ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB3_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32: @@ -667,11 +667,11 @@ define void @flat_agent_atomic_fsub_noret_f32(ptr %ptr, float %val) #0 { ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB3_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
flat_agent_atomic_fsub_noret_f32: @@ -691,11 +691,11 @@ define void @flat_agent_atomic_fsub_noret_f32(ptr %ptr, float %val) #0 { ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB3_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_noret_f32: @@ -715,10 +715,10 @@ define void @flat_agent_atomic_fsub_noret_f32(ptr %ptr, float %val) #0 { ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB3_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f32: @@ -735,11 +735,11 @@ define void @flat_agent_atomic_fsub_noret_f32(ptr %ptr, float %val) #0 { ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: 
flat_agent_atomic_fsub_noret_f32: @@ -756,11 +756,11 @@ define void @flat_agent_atomic_fsub_noret_f32(ptr %ptr, float %val) #0 { ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB3_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fsub_noret_f32: @@ -777,11 +777,11 @@ define void @flat_agent_atomic_fsub_noret_f32(ptr %ptr, float %val) #0 { ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB3_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fsub_noret_f32: @@ -798,11 +798,11 @@ define void @flat_agent_atomic_fsub_noret_f32(ptr %ptr, float %val) #0 { ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB3_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fsub ptr %ptr, float %val syncscope("agent") seq_cst ret void @@ -830,11 +830,11 @@ define void 
@flat_agent_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %va ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB4_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos: @@ -852,11 +852,11 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %va ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB4_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos: @@ -876,11 +876,11 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %va ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB4_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX11-NEXT: 
; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos: @@ -902,10 +902,10 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %va ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB4_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos: @@ -922,11 +922,11 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %va ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos: @@ -943,11 +943,11 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %va ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB4_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos: @@ -966,11 +966,11 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %va ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB4_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos: @@ -989,11 +989,11 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %va ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB4_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr %ptr, i64 511 %unused = atomicrmw fsub ptr %gep, float %val syncscope("agent") seq_cst @@ -1022,11 +1022,11 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg(ptr %ptr, float %va ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB5_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; 
GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg: @@ -1050,11 +1050,11 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg(ptr %ptr, float %va ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB5_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg: @@ -1078,11 +1078,11 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg(ptr %ptr, float %va ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB5_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg: @@ -1104,10 +1104,10 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg(ptr %ptr, float %va ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: 
s_cbranch_execnz .LBB5_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg: @@ -1128,11 +1128,11 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg(ptr %ptr, float %va ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg: @@ -1153,11 +1153,11 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg(ptr %ptr, float %va ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB5_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg: @@ -1176,11 +1176,11 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg(ptr %ptr, float %va ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: 
s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB5_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg: @@ -1199,11 +1199,11 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg(ptr %ptr, float %va ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB5_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr %ptr, i64 -512 %unused = atomicrmw fsub ptr %gep, float %val syncscope("agent") seq_cst @@ -1233,11 +1233,11 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB6_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -1257,10 +1257,10 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], 
vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB6_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -1282,11 +1282,11 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB6_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -1309,10 +1309,10 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB6_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos: @@ -1332,10 +1332,10 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: 
s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1354,10 +1354,10 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB6_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -1378,10 +1378,10 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB6_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos: @@ -1401,10 +1401,10 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB6_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; 
GFX7-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr %ptr, i64 511 %result = atomicrmw fsub ptr %gep, float %val seq_cst @@ -1433,11 +1433,11 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %v ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB7_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos: @@ -1455,11 +1455,11 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %v ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB7_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos: @@ -1479,11 +1479,11 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %v ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB7_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos: @@ -1505,10 +1505,10 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %v ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB7_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos: @@ -1527,11 +1527,11 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %v ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos: @@ -1548,11 +1548,11 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %v ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, 
s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB7_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos: @@ -1571,11 +1571,11 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %v ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB7_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos: @@ -1594,11 +1594,11 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %v ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB7_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr %ptr, i64 511 %unused = atomicrmw fsub ptr %gep, float %val seq_cst @@ -1632,11 +1632,11 @@ define float @flat_agent_atomic_fsub_ret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 
s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB8_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -1656,10 +1656,10 @@ define float @flat_agent_atomic_fsub_ret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB8_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -1681,11 +1681,11 @@ define float @flat_agent_atomic_fsub_ret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -1706,10 +1706,10 @@ define float @flat_agent_atomic_fsub_ret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX10-NEXT: buffer_gl0_inv ; 
GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB8_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1728,10 +1728,10 @@ define float @flat_agent_atomic_fsub_ret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1750,10 +1750,10 @@ define float @flat_agent_atomic_fsub_ret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB8_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -1772,10 +1772,10 @@ define float @flat_agent_atomic_fsub_ret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB8_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, 
s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1794,10 +1794,10 @@ define float @flat_agent_atomic_fsub_ret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub ptr %ptr, float %val syncscope("agent") seq_cst @@ -1827,11 +1827,11 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB9_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -1851,10 +1851,10 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB9_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, 
s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -1876,11 +1876,11 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -1903,10 +1903,10 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB9_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz: @@ -1924,10 +1924,10 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: 
s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1946,10 +1946,10 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB9_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -1970,10 +1970,10 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB9_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz: @@ -1993,10 +1993,10 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB9_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 
s[30:31] %gep = getelementptr float, ptr %ptr, i64 511 %result = atomicrmw fsub ptr %gep, float %val syncscope("agent") seq_cst @@ -2026,11 +2026,11 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB10_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2058,10 +2058,10 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB10_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz: @@ -2088,11 +2088,11 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB10_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 
s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz: @@ -2114,10 +2114,10 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB10_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz: @@ -2139,10 +2139,10 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz: @@ -2164,10 +2164,10 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB10_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 
.LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz: @@ -2187,10 +2187,10 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB10_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz: @@ -2210,10 +2210,10 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB10_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr %ptr, i64 -512 %result = atomicrmw fsub ptr %gep, float %val syncscope("agent") seq_cst @@ -2242,11 +2242,11 @@ define void @flat_agent_atomic_fsub_noret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB11_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; 
GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__ftz: @@ -2264,11 +2264,11 @@ define void @flat_agent_atomic_fsub_noret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB11_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_f32__ftz: @@ -2288,11 +2288,11 @@ define void @flat_agent_atomic_fsub_noret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB11_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_noret_f32__ftz: @@ -2312,10 +2312,10 @@ define void @flat_agent_atomic_fsub_noret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB11_1 +; 
GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f32__ftz: @@ -2332,11 +2332,11 @@ define void @flat_agent_atomic_fsub_noret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_noret_f32__ftz: @@ -2353,11 +2353,11 @@ define void @flat_agent_atomic_fsub_noret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB11_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fsub_noret_f32__ftz: @@ -2374,11 +2374,11 @@ define void @flat_agent_atomic_fsub_noret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz 
.LBB11_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fsub_noret_f32__ftz: @@ -2395,11 +2395,11 @@ define void @flat_agent_atomic_fsub_noret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB11_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fsub ptr %ptr, float %val syncscope("agent") seq_cst ret void @@ -2427,11 +2427,11 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB12_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -2449,11 +2449,11 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: 
v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB12_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -2473,11 +2473,11 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB12_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -2499,10 +2499,10 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB12_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -2519,11 +2519,11 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: 
s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -2540,11 +2540,11 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB12_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -2563,11 +2563,11 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB12_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -2586,11 +2586,11 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; 
GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB12_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr %ptr, i64 511 %unused = atomicrmw fsub ptr %gep, float %val syncscope("agent") seq_cst @@ -2619,11 +2619,11 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr %ptr, floa ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB13_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz: @@ -2647,11 +2647,11 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr %ptr, floa ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB13_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz: @@ -2675,11 
+2675,11 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr %ptr, floa ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB13_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz: @@ -2701,10 +2701,10 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr %ptr, floa ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB13_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz: @@ -2725,11 +2725,11 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr %ptr, floa ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 
s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz: @@ -2750,11 +2750,11 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr %ptr, floa ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB13_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz: @@ -2773,11 +2773,11 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr %ptr, floa ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB13_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz: @@ -2796,11 +2796,11 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr %ptr, floa ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB13_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] 
%gep = getelementptr float, ptr %ptr, i64 -512 %unused = atomicrmw fsub ptr %gep, float %val syncscope("agent") seq_cst @@ -2830,11 +2830,11 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB14_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2854,10 +2854,10 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB14_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -2879,11 +2879,11 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: 
s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2906,10 +2906,10 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB14_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz: @@ -2929,10 +2929,10 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2951,10 +2951,10 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB14_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 
; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -2975,10 +2975,10 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB14_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz: @@ -2998,10 +2998,10 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB14_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr %ptr, i64 511 %result = atomicrmw fsub ptr %gep, float %val seq_cst @@ -3030,11 +3030,11 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, flo ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB15_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 
exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -3052,11 +3052,11 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, flo ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB15_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -3076,11 +3076,11 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, flo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -3102,10 +3102,10 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, flo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB15_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; 
GFX10-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -3124,11 +3124,11 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, flo ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -3145,11 +3145,11 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, flo ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB15_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -3168,11 +3168,11 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, flo ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB15_1 +; GFX8-NEXT: 
s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -3191,11 +3191,11 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, flo ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB15_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr %ptr, i64 511 %unused = atomicrmw fsub ptr %gep, float %val seq_cst @@ -3229,11 +3229,11 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -3253,10 +3253,10 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; 
GFX940-NEXT: s_cbranch_execnz .LBB16_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: v_mov_b32_e32 v1, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -3279,11 +3279,11 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB16_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3305,10 +3305,10 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB16_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -3328,10 +3328,10 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: 
s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3352,10 +3352,10 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB16_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -3379,10 +3379,10 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB16_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -3406,10 +3406,10 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB16_1 +; GFX7-NEXT: 
s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -3440,11 +3440,11 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -3464,10 +3464,10 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB17_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: v_mov_b32_e32 v1, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -3490,11 +3490,11 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3518,10 +3518,10 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB17_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos: @@ -3539,10 +3539,10 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3563,10 +3563,10 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: 
s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB17_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -3592,10 +3592,10 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB17_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos: @@ -3619,10 +3619,10 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %ptr, i64 255 %result = atomicrmw fsub ptr %gep, double %val syncscope("agent") seq_cst @@ -3652,11 +3652,11 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; 
GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -3684,10 +3684,10 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB18_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg: @@ -3714,11 +3714,11 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg: @@ -3741,10 +3741,10 @@ define double 
@flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB18_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg: @@ -3766,10 +3766,10 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg: @@ -3792,10 +3792,10 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg: @@ -3819,10 +3819,10 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX8-NEXT: 
buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB18_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg: @@ -3846,10 +3846,10 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB18_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %ptr, i64 -256 %result = atomicrmw fsub ptr %gep, double %val syncscope("agent") seq_cst @@ -3878,11 +3878,11 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 { ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f64: @@ -3900,11 +3900,11 @@ define 
void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 { ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB19_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_f64: @@ -3924,11 +3924,11 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 { ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_noret_f64: @@ -3949,10 +3949,10 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 { ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB19_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: 
flat_agent_atomic_fsub_noret_f64: @@ -3969,11 +3969,11 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 { ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB19_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_noret_f64: @@ -3991,11 +3991,11 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 { ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB19_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fsub_noret_f64: @@ -4016,11 +4016,11 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 { ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB19_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: 
flat_agent_atomic_fsub_noret_f64: @@ -4041,11 +4041,11 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 { ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB19_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fsub ptr %ptr, double %val syncscope("agent") seq_cst ret void @@ -4073,11 +4073,11 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB20_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos: @@ -4095,11 +4095,11 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB20_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: 
s_cbranch_scc1 .LBB20_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos: @@ -4119,11 +4119,11 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB20_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos: @@ -4146,10 +4146,10 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB20_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos: @@ -4166,11 +4166,11 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: 
s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos: @@ -4188,11 +4188,11 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB20_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos: @@ -4215,11 +4215,11 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX8-NEXT: v_mov_b32_e32 v7, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB20_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos: @@ -4242,11 +4242,11 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX7-NEXT: v_mov_b32_e32 v7, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: 
v_mov_b32_e32 v6, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB20_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %ptr, i64 255 %unused = atomicrmw fsub ptr %gep, double %val syncscope("agent") seq_cst @@ -4275,11 +4275,11 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB21_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg: @@ -4303,11 +4303,11 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB21_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg: @@ -4331,11 +4331,11 @@ define void 
@flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB21_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg: @@ -4358,10 +4358,10 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB21_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg: @@ -4382,11 +4382,11 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; 
GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg: @@ -4408,11 +4408,11 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX908-NEXT: v_mov_b32_e32 v7, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB21_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg: @@ -4435,11 +4435,11 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX8-NEXT: v_mov_b32_e32 v7, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB21_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg: @@ -4462,11 +4462,11 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX7-NEXT: v_mov_b32_e32 v7, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB21_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 
exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %ptr, i64 -256 %unused = atomicrmw fsub ptr %gep, double %val syncscope("agent") seq_cst @@ -4514,11 +4514,11 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB22_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -4527,32 +4527,32 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 { ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v0 ; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX940-NEXT: flat_load_dword v4, v[0:1] +; GFX940-NEXT: flat_load_dword v5, v[0:1] ; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX940-NEXT: v_sub_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: 
v_sub_f16_e32 v5, v5, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB22_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_ret_f16: @@ -4587,11 +4587,11 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB22_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4621,10 +4621,10 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 { ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB22_1 +; 
GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4633,31 +4633,31 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v4, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX90A-NEXT: v_sub_f16_e32 v4, v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_sub_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; 
GFX90A-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_ret_f16: @@ -4665,31 +4665,31 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 { ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v0 ; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: flat_load_dword v5, v[0:1] ; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v4 -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX908-NEXT: v_sub_f16_e32 v4, v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX908-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc +; GFX908-NEXT: v_mov_b32_e32 v6, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_sub_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB22_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], 
s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fsub_ret_f16: @@ -4718,10 +4718,10 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 { ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB22_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -4754,10 +4754,10 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 { ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB22_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -4803,11 +4803,11 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB23_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 
s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -4815,35 +4815,35 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: flat_load_dword v4, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: flat_load_dword v5, v[0:1] +; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 ; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX940-NEXT: v_sub_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_sub_f16_e32 v5, v5, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: 
v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB23_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos: @@ -4879,11 +4879,11 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB23_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4914,10 +4914,10 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB23_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4927,31 +4927,31 @@ define half 
@flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v4, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX90A-NEXT: v_sub_f16_e32 v4, v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_sub_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: 
flat_agent_atomic_fsub_ret_f16__offset12b_pos: @@ -4960,31 +4960,31 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: flat_load_dword v5, v[0:1] ; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v4 -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX908-NEXT: v_sub_f16_e32 v4, v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX908-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc +; GFX908-NEXT: v_mov_b32_e32 v6, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_sub_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB23_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; 
GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos: @@ -5014,10 +5014,10 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB23_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -5051,10 +5051,10 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB23_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -5101,11 +5101,11 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB24_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; 
GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -5114,35 +5114,35 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_movk_i32 s0, 0xf800 ; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: flat_load_dword v4, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: flat_load_dword v5, v[0:1] +; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 ; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX940-NEXT: v_sub_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_sub_f16_e32 v5, v5, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; 
GFX940-NEXT: s_cbranch_execnz .LBB24_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg: @@ -5178,11 +5178,11 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB24_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -5213,10 +5213,10 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB24_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -5226,31 +5226,31 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 
-1, v1, vcc ; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v4, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX90A-NEXT: v_sub_f16_e32 v4, v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_sub_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg: @@ -5259,31 +5259,31 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 
0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: flat_load_dword v5, v[0:1] ; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v4 -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX908-NEXT: v_sub_f16_e32 v4, v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX908-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc +; GFX908-NEXT: v_mov_b32_e32 v6, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_sub_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB24_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg: @@ -5313,10 +5313,10 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, 
half %val) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB24_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -5350,10 +5350,10 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB24_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -5398,11 +5398,11 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 { ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB25_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f16: @@ -5430,11 +5430,11 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 { ; GFX940-NEXT: 
buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB25_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_f16: @@ -5468,11 +5468,11 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 { ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB25_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_noret_f16: @@ -5501,10 +5501,10 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 { ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB25_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f16: @@ -5531,11 +5531,11 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 { ; 
GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB25_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_noret_f16: @@ -5562,11 +5562,11 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 { ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB25_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fsub_noret_f16: @@ -5594,11 +5594,11 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 { ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB25_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fsub_noret_f16: @@ -5629,11 +5629,11 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 { ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; 
GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB25_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fsub ptr %ptr, half %val syncscope("agent") seq_cst ret void @@ -5676,11 +5676,11 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB26_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos: @@ -5710,11 +5710,11 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB26_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos: @@ -5749,11 +5749,11 @@ define void 
@flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos: @@ -5783,10 +5783,10 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB26_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos: @@ -5814,11 +5814,11 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: 
flat_agent_atomic_fsub_noret_f16__offset12b_pos: @@ -5846,11 +5846,11 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB26_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos: @@ -5879,11 +5879,11 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB26_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos: @@ -5915,11 +5915,11 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB26_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr %ptr, i64 
1023 %unused = atomicrmw fsub ptr %gep, half %val syncscope("agent") seq_cst @@ -5963,11 +5963,11 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB27_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg: @@ -5998,11 +5998,11 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB27_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg: @@ -6037,11 +6037,11 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: 
s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg: @@ -6071,10 +6071,10 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB27_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg: @@ -6102,11 +6102,11 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg: @@ -6134,11 +6134,11 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz 
.LBB27_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg: @@ -6167,11 +6167,11 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB27_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg: @@ -6203,11 +6203,11 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB27_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr %ptr, i64 -1024 %unused = atomicrmw fsub ptr %gep, half %val syncscope("agent") seq_cst @@ -6240,11 +6240,11 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: 
s_cbranch_execnz .LBB28_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -6266,10 +6266,10 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB28_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -6294,11 +6294,11 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB28_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -6323,10 +6323,10 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 
exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB28_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4: @@ -6346,10 +6346,10 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6370,10 +6370,10 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB28_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -6396,10 +6396,10 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB28_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], 
s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4: @@ -6425,10 +6425,10 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB28_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr %ptr, i64 1023 @@ -6461,11 +6461,11 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB29_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos: @@ -6485,11 +6485,11 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, 
s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB29_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos: @@ -6512,11 +6512,11 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB29_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos: @@ -6540,10 +6540,10 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB29_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos: @@ -6562,11 +6562,11 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 
s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos: @@ -6585,11 +6585,11 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB29_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos: @@ -6610,11 +6610,11 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB29_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos: @@ -6639,11 +6639,11 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; 
GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB29_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr %ptr, i64 1023 %unused = atomicrmw fsub ptr %gep, half %val syncscope("agent") seq_cst, align 4 @@ -6688,11 +6688,11 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB30_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -6700,35 +6700,35 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: flat_load_dword v4, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: flat_load_dword v5, v[0:1] +; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 ; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: 
v_lshlrev_b32_e64 v5, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX940-NEXT: v_sub_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_sub_f16_e32 v5, v5, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB30_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos: @@ -6764,11 +6764,11 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB30_1 +; GFX11-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -6799,10 +6799,10 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB30_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -6812,33 +6812,33 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v4, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX90A-NEXT: v_sub_f16_e32 v4, v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX90A-NEXT: 
v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_sub_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc +; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos: @@ -6847,31 +6847,31 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: flat_load_dword v5, v[0:1] ; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v4 -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX908-NEXT: v_sub_f16_e32 v4, v4, v2 
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX908-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc +; GFX908-NEXT: v_mov_b32_e32 v6, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_sub_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB30_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos: @@ -6901,10 +6901,10 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB30_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -6938,10 +6938,10 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; 
GFX7-NEXT: s_cbranch_execnz .LBB30_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -6987,11 +6987,11 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB31_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos: @@ -7021,11 +7021,11 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB31_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos: @@ -7060,11 +7060,11 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 
v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB31_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos: @@ -7094,10 +7094,10 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB31_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos: @@ -7127,11 +7127,11 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos: @@ -7159,11 +7159,11 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half 
%va ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB31_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos: @@ -7192,11 +7192,11 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB31_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos: @@ -7228,11 +7228,11 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB31_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr %ptr, i64 1023 %unused = atomicrmw fsub ptr %gep, half %val seq_cst @@ -7288,11 +7288,11 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat 
%val) #0 { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB32_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -7331,10 +7331,10 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB32_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -7379,11 +7379,11 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB32_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: 
s_setpc_b64 s[30:31] ; @@ -7419,10 +7419,10 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB32_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -7458,10 +7458,10 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7497,10 +7497,10 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB32_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -7537,10 +7537,10 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 
vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB32_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -7573,10 +7573,10 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB32_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -7631,11 +7631,11 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB33_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -7676,10 +7676,10 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; 
GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB33_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -7726,11 +7726,11 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB33_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -7767,10 +7767,10 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB33_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -7807,10 +7807,10 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, 
s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7847,10 +7847,10 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB33_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -7888,10 +7888,10 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB33_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -7925,10 +7925,10 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB33_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, 
s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -7984,11 +7984,11 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB34_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -8030,10 +8030,10 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB34_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -8080,11 +8080,11 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB34_1 +; 
GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -8121,10 +8121,10 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB34_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -8161,10 +8161,10 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8201,10 +8201,10 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB34_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, 
s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -8242,10 +8242,10 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB34_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -8279,10 +8279,10 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB34_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -8335,11 +8335,11 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB35_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; 
GFX12-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_bf16: @@ -8376,11 +8376,11 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB35_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_bf16: @@ -8423,11 +8423,11 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB35_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_noret_bf16: @@ -8462,10 +8462,10 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB35_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 
exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fsub_noret_bf16: @@ -8499,11 +8499,11 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_noret_bf16: @@ -8537,11 +8537,11 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB35_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fsub_noret_bf16: @@ -8576,11 +8576,11 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB35_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 
.LBB35_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fsub_noret_bf16: @@ -8611,11 +8611,11 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB35_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fsub ptr %ptr, bfloat %val syncscope("agent") seq_cst ret void @@ -8667,11 +8667,11 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB36_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos: @@ -8710,11 +8710,11 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz 
.LBB36_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos: @@ -8759,11 +8759,11 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB36_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos: @@ -8799,10 +8799,10 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB36_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos: @@ -8837,11 +8837,11 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; 
GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB36_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos: @@ -8876,11 +8876,11 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB36_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos: @@ -8916,11 +8916,11 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB36_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos: @@ -8952,11 +8952,11 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; 
GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB36_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr %ptr, i64 1023 %unused = atomicrmw fsub ptr %gep, bfloat %val syncscope("agent") seq_cst @@ -9009,11 +9009,11 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB37_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg: @@ -9053,11 +9053,11 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB37_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg: @@ -9102,11 +9102,11 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 
v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB37_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg: @@ -9142,10 +9142,10 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB37_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg: @@ -9180,11 +9180,11 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB37_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg: @@ -9219,11 +9219,11 @@ define void 
@flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB37_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg: @@ -9259,11 +9259,11 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB37_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg: @@ -9295,11 +9295,11 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB37_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr %ptr, i64 -1024 %unused = atomicrmw fsub ptr %gep, bfloat %val syncscope("agent") seq_cst 
@@ -9341,11 +9341,11 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB38_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -9377,10 +9377,10 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB38_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -9415,11 +9415,11 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB38_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 
exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -9451,10 +9451,10 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB38_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4: @@ -9483,10 +9483,10 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB38_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9516,10 +9516,10 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB38_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -9550,10 +9550,10 @@ define bfloat 
@flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB38_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4: @@ -9579,10 +9579,10 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB38_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr %ptr, i64 1023 @@ -9624,11 +9624,11 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr, ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB39_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: @@ -9658,11 +9658,11 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr, ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB39_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: @@ -9695,11 +9695,11 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr, ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB39_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: @@ -9730,10 +9730,10 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr, ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB39_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: @@ -9761,11 +9761,11 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr, ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB39_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: @@ -9793,11 +9793,11 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr, ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB39_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: @@ -9826,11 +9826,11 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr, ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB39_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB39_1 ; 
GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: @@ -9855,11 +9855,11 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr, ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB39_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr %ptr, i64 1023 %unused = atomicrmw fsub ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4 @@ -9913,11 +9913,11 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB40_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -9958,10 +9958,10 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB40_1 +; GFX940-NEXT: s_andn2_b64 
s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -10008,11 +10008,11 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB40_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -10049,10 +10049,10 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB40_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -10091,10 +10091,10 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB40_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; 
GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10131,10 +10131,10 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB40_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -10172,10 +10172,10 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB40_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -10209,10 +10209,10 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB40_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; 
GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -10267,11 +10267,11 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB41_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos: @@ -10310,11 +10310,11 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB41_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos: @@ -10359,11 +10359,11 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB41_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos: @@ -10399,10 +10399,10 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB41_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos: @@ -10439,11 +10439,11 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB41_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos: @@ -10478,11 +10478,11 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: 
s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB41_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos: @@ -10518,11 +10518,11 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB41_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos: @@ -10554,11 +10554,11 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB41_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr %ptr, i64 1023 %unused = atomicrmw fsub ptr %gep, bfloat %val seq_cst @@ -10592,11 +10592,11 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) # ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB42_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -10616,10 +10616,10 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) # ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB42_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -10641,11 +10641,11 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) # ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB42_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -10666,10 +10666,10 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) # ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; 
GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB42_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -10688,10 +10688,10 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) # ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB42_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10710,10 +10710,10 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) # ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB42_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -10734,10 +10734,10 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) # ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB42_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: 
s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -10761,25 +10761,25 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) # ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v2, v3 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v4 ; GFX7-NEXT: v_sub_f32_e32 v7, v7, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[6:7] glc +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB42_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -10810,11 +10810,11 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB43_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -10834,10 +10834,10 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB43_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -10859,11 +10859,11 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB43_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -10886,10 +10886,10 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX10-NEXT: buffer_gl0_inv ; 
GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB43_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_pos: @@ -10907,10 +10907,10 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB43_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10929,10 +10929,10 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB43_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -10955,10 +10955,10 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: 
s_cbranch_execnz .LBB43_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_pos: @@ -10983,25 +10983,25 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: flat_atomic_cmpswap v6, v[4:5], v[6:7] glc +; GFX7-NEXT: v_or_b32_e32 v0, v7, v0 +; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB43_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fsub ptr %gep, <2 x half> %val syncscope("agent") seq_cst @@ -11031,11 +11031,11 @@ define <2 x half> 
@flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB44_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -11063,10 +11063,10 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB44_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_neg: @@ -11093,11 +11093,11 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB44_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_neg: @@ -11119,10 +11119,10 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2 ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB44_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_neg: @@ -11144,10 +11144,10 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_neg: @@ -11169,10 +11169,10 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB44_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: 
flat_agent_atomic_fsub_ret_v2f16__offset12b_neg: @@ -11194,10 +11194,10 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB44_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_neg: @@ -11222,25 +11222,25 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: flat_atomic_cmpswap v6, v[4:5], v[6:7] glc +; GFX7-NEXT: v_or_b32_e32 v0, v7, v0 +; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB44_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: 
s_cbranch_scc1 .LBB44_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr %ptr, i64 -512 %result = atomicrmw fsub ptr %gep, <2 x half> %val syncscope("agent") seq_cst @@ -11269,11 +11269,11 @@ define void @flat_agent_atomic_fsub_noret_v2f16(ptr %ptr, <2 x half> %val) #0 { ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB45_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2f16: @@ -11291,11 +11291,11 @@ define void @flat_agent_atomic_fsub_noret_v2f16(ptr %ptr, <2 x half> %val) #0 { ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB45_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_v2f16: @@ -11315,11 +11315,11 @@ define void @flat_agent_atomic_fsub_noret_v2f16(ptr %ptr, <2 x half> %val) #0 { ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 
exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB45_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_noret_v2f16: @@ -11339,10 +11339,10 @@ define void @flat_agent_atomic_fsub_noret_v2f16(ptr %ptr, <2 x half> %val) #0 { ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB45_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fsub_noret_v2f16: @@ -11359,11 +11359,11 @@ define void @flat_agent_atomic_fsub_noret_v2f16(ptr %ptr, <2 x half> %val) #0 { ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB45_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_noret_v2f16: @@ -11380,11 +11380,11 @@ define void @flat_agent_atomic_fsub_noret_v2f16(ptr %ptr, <2 x half> %val) #0 { ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, 
s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB45_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fsub_noret_v2f16: @@ -11403,11 +11403,11 @@ define void @flat_agent_atomic_fsub_noret_v2f16(ptr %ptr, <2 x half> %val) #0 { ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB45_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fsub_noret_v2f16: @@ -11430,25 +11430,25 @@ define void @flat_agent_atomic_fsub_noret_v2f16(ptr %ptr, <2 x half> %val) #0 { ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB45_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fsub ptr %ptr, <2 x half> %val syncscope("agent") seq_cst ret void @@ -11476,11 +11476,11 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x ha ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB46_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_pos: @@ -11498,11 +11498,11 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x ha ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB46_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_pos: @@ -11522,11 +11522,11 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x ha ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB46_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_pos: @@ -11548,10 +11548,10 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x ha ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB46_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_pos: @@ -11568,11 +11568,11 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x ha ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB46_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX90A-NEXT: 
; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_pos: @@ -11589,11 +11589,11 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x ha ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB46_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_pos: @@ -11614,11 +11614,11 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x ha ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB46_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_pos: @@ -11643,25 +11643,25 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x ha ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB46_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr %ptr, i64 511 %unused = atomicrmw fsub ptr %gep, <2 x half> %val syncscope("agent") seq_cst @@ -11690,11 +11690,11 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB47_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_neg: @@ -11718,11 +11718,11 @@ define void 
@flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB47_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_neg: @@ -11746,11 +11746,11 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB47_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_neg: @@ -11772,10 +11772,10 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB47_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: 
flat_agent_atomic_fsub_noret_v2f16__offset12b_neg: @@ -11796,11 +11796,11 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_neg: @@ -11821,11 +11821,11 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB47_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_neg: @@ -11846,11 +11846,11 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB47_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX7-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_neg: @@ -11875,25 +11875,25 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB47_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr %ptr, i64 -512 %unused = atomicrmw fsub ptr %gep, <2 x half> %val syncscope("agent") seq_cst @@ -11923,11 +11923,11 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB48_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -11947,10 +11947,10 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB48_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -11972,11 +11972,11 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB48_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -11999,10 +11999,10 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; 
GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB48_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fsub_ret_v2f16__offset12b_pos: @@ -12022,10 +12022,10 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -12044,10 +12044,10 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB48_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -12070,10 +12070,10 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB48_1 +; GFX8-NEXT: s_andn2_b64 
s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_system_atomic_fsub_ret_v2f16__offset12b_pos: @@ -12098,25 +12098,25 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: flat_atomic_cmpswap v6, v[4:5], v[6:7] glc +; GFX7-NEXT: v_or_b32_e32 v0, v7, v0 +; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB48_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fsub ptr %gep, <2 x half> %val seq_cst @@ -12145,11 +12145,11 @@ define void @flat_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h ; GFX12-NEXT: v_cmp_eq_u32_e32 
vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB49_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_noret_v2f16__offset12b_pos: @@ -12167,11 +12167,11 @@ define void @flat_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB49_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fsub_noret_v2f16__offset12b_pos: @@ -12191,11 +12191,11 @@ define void @flat_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB49_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; 
GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fsub_noret_v2f16__offset12b_pos: @@ -12217,10 +12217,10 @@ define void @flat_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB49_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fsub_noret_v2f16__offset12b_pos: @@ -12239,11 +12239,11 @@ define void @flat_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fsub_noret_v2f16__offset12b_pos: @@ -12260,11 +12260,11 @@ define void @flat_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB49_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; 
GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_system_atomic_fsub_noret_v2f16__offset12b_pos: @@ -12285,11 +12285,11 @@ define void @flat_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB49_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_system_atomic_fsub_noret_v2f16__offset12b_pos: @@ -12314,25 +12314,25 @@ define void @flat_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: 
s_cbranch_execnz .LBB49_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr %ptr, i64 511 %unused = atomicrmw fsub ptr %gep, <2 x half> %val seq_cst @@ -12386,11 +12386,11 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB50_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -12429,10 +12429,10 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB50_1 +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -12476,12 +12476,12 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB50_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -12518,10 +12518,10 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB50_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -12558,10 +12558,10 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -12598,10 +12598,10 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 
exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB50_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -12639,10 +12639,10 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB50_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -12674,13 +12674,13 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB50_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -12731,11 +12731,11 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB51_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -12774,10 +12774,10 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB51_1 +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -12821,12 +12821,12 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB51_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -12865,10 +12865,10 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr 
%ptr, ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB51_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos: @@ -12904,10 +12904,10 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -12944,10 +12944,10 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB51_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -12987,10 +12987,10 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, 
exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB51_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos: @@ -13023,13 +13023,13 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB51_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 %result = atomicrmw fsub ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst @@ -13079,11 +13079,11 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB52_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -13130,10 
+13130,10 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB52_1 +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg: @@ -13182,12 +13182,12 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB52_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg: @@ -13225,10 +13225,10 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB52_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; 
; GFX90A-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg: @@ -13268,10 +13268,10 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg: @@ -13311,10 +13311,10 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB52_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg: @@ -13353,10 +13353,10 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB52_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg: @@ -13389,13 +13389,13 @@ define 
<2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB52_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512 %result = atomicrmw fsub ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst @@ -13444,11 +13444,11 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB53_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2bf16: @@ -13485,11 +13485,11 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB53_1 +; 
GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_v2bf16: @@ -13531,12 +13531,12 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB53_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_noret_v2bf16: @@ -13572,10 +13572,10 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB53_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fsub_noret_v2bf16: @@ -13610,11 +13610,11 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_noret_v2bf16: @@ -13649,11 +13649,11 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB53_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fsub_noret_v2bf16: @@ -13689,11 +13689,11 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB53_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fsub_noret_v2bf16: @@ -13724,13 +13724,13 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 
v4, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB53_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fsub ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst ret void @@ -13778,11 +13778,11 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB54_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -13819,11 +13819,11 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB54_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -13865,12 +13865,12 @@ define void 
@flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB54_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -13908,10 +13908,10 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB54_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -13946,11 +13946,11 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; 
GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -13985,11 +13985,11 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB54_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -14027,11 +14027,11 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB54_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -14064,13 +14064,13 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB54_1 +; GFX7-NEXT: s_cselect_b64 
exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 %unused = atomicrmw fsub ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst @@ -14119,11 +14119,11 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB55_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg: @@ -14166,11 +14166,11 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB55_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg: @@ -14216,12 +14216,12 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; 
GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB55_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg: @@ -14259,10 +14259,10 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB55_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg: @@ -14301,11 +14301,11 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg: @@ -14344,11 +14344,11 @@ define void 
@flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB55_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg: @@ -14386,11 +14386,11 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB55_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg: @@ -14423,13 +14423,13 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB55_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; 
GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512 %unused = atomicrmw fsub ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst @@ -14479,11 +14479,11 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB56_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -14522,10 +14522,10 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB56_1 +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -14569,12 +14569,12 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB56_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: 
s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -14613,10 +14613,10 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB56_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos: @@ -14654,10 +14654,10 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -14694,10 +14694,10 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB56_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: 
s_cbranch_scc1 .LBB56_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -14737,10 +14737,10 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB56_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos: @@ -14773,13 +14773,13 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB56_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 %result = atomicrmw fsub ptr %gep, <2 x bfloat> %val seq_cst @@ -14828,11 +14828,11 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: 
s_cbranch_execnz .LBB57_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -14869,11 +14869,11 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB57_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -14915,12 +14915,12 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB57_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -14958,10 +14958,10 @@ define void 
@flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB57_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -14998,11 +14998,11 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -15037,11 +15037,11 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB57_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -15079,11 +15079,11 @@ define void 
@flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB57_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -15116,13 +15116,13 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB57_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 %unused = atomicrmw fsub ptr %gep, <2 x bfloat> %val seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll index 9c2faf622623d6..486df200436aed 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll @@ -1773,11 +1773,11 @@ define void @flat_atomic_nand_i32_noret(ptr %ptr, i32 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; 
GCN1-NEXT: v_mov_b32_e32 v4, v3 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB50_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB50_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i32_noret: @@ -1795,11 +1795,11 @@ define void @flat_atomic_nand_i32_noret(ptr %ptr, i32 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v4, v3 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB50_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB50_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i32_noret: @@ -1817,11 +1817,11 @@ define void @flat_atomic_nand_i32_noret(ptr %ptr, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v4, v3 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB50_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB50_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw nand ptr %ptr, i32 %in seq_cst ret void @@ -1845,11 +1845,11 @@ define void @flat_atomic_nand_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v4, v3 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB51_1 +; GCN1-NEXT: 
s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB51_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i32_noret_offset: @@ -1869,11 +1869,11 @@ define void @flat_atomic_nand_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v4, v3 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB51_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB51_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i32_noret_offset: @@ -1891,11 +1891,11 @@ define void @flat_atomic_nand_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v4, v3 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB51_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB51_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw nand ptr %gep, i32 %in seq_cst @@ -1919,10 +1919,10 @@ define i32 @flat_atomic_nand_i32_ret(ptr %ptr, i32 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB52_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB52_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v3 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -1942,10 +1942,10 @@ define i32 @flat_atomic_nand_i32_ret(ptr %ptr, i32 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB52_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB52_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v3 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -1965,10 +1965,10 @@ define i32 @flat_atomic_nand_i32_ret(ptr %ptr, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB52_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB52_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw nand ptr %ptr, i32 %in seq_cst @@ -1994,10 +1994,10 @@ define i32 @flat_atomic_nand_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB53_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB53_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i32_ret_offset: @@ -2018,10 +2018,10 @@ define i32 @flat_atomic_nand_i32_ret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: 
buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB53_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB53_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i32_ret_offset: @@ -2040,10 +2040,10 @@ define i32 @flat_atomic_nand_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB53_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB53_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 @@ -2069,11 +2069,11 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB54_1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB54_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i32_noret_scalar: @@ -2093,11 +2093,11 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: 
s_andn2_b64 s[36:37], exec, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB54_1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB54_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i32_noret_scalar: @@ -2117,11 +2117,11 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB54_1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB54_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw nand ptr %ptr, i32 %in seq_cst ret void @@ -2147,11 +2147,11 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB55_1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB55_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i32_noret_offset_scalar: @@ -2173,11 +2173,11 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 
s[36:37], exec, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB55_1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB55_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i32_noret_offset_scalar: @@ -2197,11 +2197,11 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB55_1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB55_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw nand ptr %gep, i32 %in seq_cst @@ -2229,10 +2229,10 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB56_1 +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB56_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i32_ret_scalar: @@ -2255,10 +2255,10 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] 
-; GCN2-NEXT: s_cbranch_execnz .LBB56_1 +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB56_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i32_ret_scalar: @@ -2281,10 +2281,10 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB56_1 +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB56_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw nand ptr %ptr, i32 %in seq_cst ret i32 %result @@ -2311,10 +2311,10 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB57_1 +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB57_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i32_ret_offset_scalar: @@ -2337,10 +2337,10 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB57_1 +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: 
s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB57_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i32_ret_offset_scalar: @@ -2363,10 +2363,10 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB57_1 +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB57_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw nand ptr %gep, i32 %in seq_cst @@ -2391,11 +2391,11 @@ define void @flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v4, v3 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB58_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB58_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory: @@ -2415,11 +2415,11 @@ define void @flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v4, v3 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB58_1 +; GCN2-NEXT: s_cselect_b64 exec, 
s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB58_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory: @@ -2437,11 +2437,11 @@ define void @flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v4, v3 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB58_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB58_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i64 4 %tmp0 = atomicrmw nand ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -2467,10 +2467,10 @@ define i32 @flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB59_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB59_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory: @@ -2491,10 +2491,10 @@ define i32 @flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB59_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: 
s_cbranch_scc1 .LBB59_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory: @@ -2513,10 +2513,10 @@ define i32 @flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB59_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB59_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i64 4 @@ -3243,11 +3243,11 @@ define void @flat_atomic_max_i32_noret(ptr %ptr, i32 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v4, v3 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB80_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB80_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i32_noret: @@ -3264,11 +3264,11 @@ define void @flat_atomic_max_i32_noret(ptr %ptr, i32 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v4, v3 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB80_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB80_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; 
GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i32_noret: @@ -3285,11 +3285,11 @@ define void @flat_atomic_max_i32_noret(ptr %ptr, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v4, v3 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB80_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB80_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw max ptr %ptr, i32 %in seq_cst ret void @@ -3312,11 +3312,11 @@ define void @flat_atomic_max_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v4, v3 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB81_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB81_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i32_noret_offset: @@ -3335,11 +3335,11 @@ define void @flat_atomic_max_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v4, v3 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB81_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB81_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i32_noret_offset: @@ -3356,11 +3356,11 @@ define void 
@flat_atomic_max_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v4, v3 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB81_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB81_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw max ptr %gep, i32 %in seq_cst @@ -3383,10 +3383,10 @@ define i32 @flat_atomic_max_i32_ret(ptr %ptr, i32 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB82_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB82_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v3 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -3405,10 +3405,10 @@ define i32 @flat_atomic_max_i32_ret(ptr %ptr, i32 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB82_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB82_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v3 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -3427,10 +3427,10 @@ define i32 @flat_atomic_max_i32_ret(ptr %ptr, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, 
s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB82_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB82_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw max ptr %ptr, i32 %in seq_cst @@ -3455,10 +3455,10 @@ define i32 @flat_atomic_max_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB83_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB83_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i32_ret_offset: @@ -3478,10 +3478,10 @@ define i32 @flat_atomic_max_i32_ret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB83_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB83_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i32_ret_offset: @@ -3499,10 +3499,10 @@ define i32 @flat_atomic_max_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB83_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB83_1 ; GCN3-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 @@ -3527,11 +3527,11 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB84_1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB84_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i32_noret_scalar: @@ -3550,11 +3550,11 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB84_1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB84_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i32_noret_scalar: @@ -3573,11 +3573,11 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB84_1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB84_1 ; GCN3-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw max ptr %ptr, i32 %in seq_cst ret void @@ -3602,11 +3602,11 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB85_1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB85_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i32_noret_offset_scalar: @@ -3627,11 +3627,11 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB85_1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB85_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i32_noret_offset_scalar: @@ -3650,11 +3650,11 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB85_1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB85_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end 
-; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw max ptr %gep, i32 %in seq_cst @@ -3681,10 +3681,10 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB86_1 +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB86_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i32_ret_scalar: @@ -3706,10 +3706,10 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB86_1 +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB86_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i32_ret_scalar: @@ -3731,10 +3731,10 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB86_1 +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB86_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw max 
ptr %ptr, i32 %in seq_cst ret i32 %result @@ -3760,10 +3760,10 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB87_1 +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB87_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i32_ret_offset_scalar: @@ -3785,10 +3785,10 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB87_1 +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB87_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i32_ret_offset_scalar: @@ -3810,10 +3810,10 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB87_1 +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB87_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw max ptr %gep, i32 %in seq_cst @@ -3845,9 +3845,10 @@ define 
amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB88_1 +; GCN1-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN1-NEXT: s_cbranch_scc1 .LBB88_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -3875,9 +3876,10 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB88_1 +; GCN2-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN2-NEXT: s_cbranch_scc1 .LBB88_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -3903,9 +3905,10 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB88_1 +; GCN3-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GCN3-NEXT: s_cbranch_scc1 .LBB88_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: @@ -3942,10 +3945,10 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB89_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN1-NEXT: 
s_cbranch_scc1 .LBB89_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 @@ -3977,10 +3980,10 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB89_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN2-NEXT: s_cbranch_scc1 .LBB89_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 @@ -4010,10 +4013,10 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GCN3-NEXT: s_cbranch_execnz .LBB89_1 +; GCN3-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GCN3-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GCN3-NEXT: s_cbranch_scc1 .LBB89_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 @@ -4049,9 +4052,10 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB90_1 +; GCN1-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN1-NEXT: s_cbranch_scc1 .LBB90_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: 
s_endpgm ; @@ -4077,9 +4081,10 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB90_1 +; GCN2-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN2-NEXT: s_cbranch_scc1 .LBB90_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -4105,9 +4110,10 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB90_1 +; GCN3-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GCN3-NEXT: s_cbranch_scc1 .LBB90_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: @@ -4141,10 +4147,10 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB91_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN1-NEXT: s_cbranch_scc1 .LBB91_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 @@ -4174,10 +4180,10 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: 
s_cbranch_execnz .LBB91_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN2-NEXT: s_cbranch_scc1 .LBB91_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 @@ -4207,10 +4213,10 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GCN3-NEXT: s_cbranch_execnz .LBB91_1 +; GCN3-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GCN3-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GCN3-NEXT: s_cbranch_scc1 .LBB91_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 @@ -4239,11 +4245,11 @@ define void @flat_max_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %i ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v4, v3 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB92_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB92_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_max_i32_noret_offset__amdgpu_no_remote_memory: @@ -4262,11 +4268,11 @@ define void @flat_max_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %i ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v4, v3 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: 
s_cbranch_execnz .LBB92_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB92_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_max_i32_noret_offset__amdgpu_no_remote_memory: @@ -4283,11 +4289,11 @@ define void @flat_max_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %i ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v4, v3 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB92_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB92_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i64 4 %tmp0 = atomicrmw max ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -4312,10 +4318,10 @@ define i32 @flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i3 ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB93_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB93_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory: @@ -4335,10 +4341,10 @@ define i32 @flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i3 ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB93_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: 
s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB93_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory: @@ -4356,10 +4362,10 @@ define i32 @flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i3 ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB93_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB93_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i64 4 @@ -4386,11 +4392,11 @@ define void @flat_atomic_umax_i32_noret(ptr %ptr, i32 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v4, v3 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB94_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB94_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i32_noret: @@ -4407,11 +4413,11 @@ define void @flat_atomic_umax_i32_noret(ptr %ptr, i32 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v4, v3 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB94_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB94_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end 
-; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i32_noret: @@ -4428,11 +4434,11 @@ define void @flat_atomic_umax_i32_noret(ptr %ptr, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v4, v3 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB94_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB94_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw umax ptr %ptr, i32 %in seq_cst ret void @@ -4455,11 +4461,11 @@ define void @flat_atomic_umax_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v4, v3 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB95_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB95_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i32_noret_offset: @@ -4478,11 +4484,11 @@ define void @flat_atomic_umax_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v4, v3 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB95_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB95_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: 
flat_atomic_umax_i32_noret_offset: @@ -4499,11 +4505,11 @@ define void @flat_atomic_umax_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v4, v3 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB95_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB95_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw umax ptr %gep, i32 %in seq_cst @@ -4526,10 +4532,10 @@ define i32 @flat_atomic_umax_i32_ret(ptr %ptr, i32 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB96_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB96_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v3 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -4548,10 +4554,10 @@ define i32 @flat_atomic_umax_i32_ret(ptr %ptr, i32 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB96_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB96_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v3 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -4570,10 +4576,10 @@ define i32 @flat_atomic_umax_i32_ret(ptr %ptr, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; 
GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB96_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB96_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw umax ptr %ptr, i32 %in seq_cst @@ -4598,10 +4604,10 @@ define i32 @flat_atomic_umax_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB97_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB97_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i32_ret_offset: @@ -4621,10 +4627,10 @@ define i32 @flat_atomic_umax_i32_ret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB97_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB97_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i32_ret_offset: @@ -4642,10 +4648,10 @@ define i32 @flat_atomic_umax_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB97_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_cselect_b64 
exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB97_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 @@ -4670,11 +4676,11 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB98_1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB98_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i32_noret_scalar: @@ -4693,11 +4699,11 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB98_1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB98_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i32_noret_scalar: @@ -4716,11 +4722,11 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB98_1 +; GCN3-NEXT: s_cselect_b64 exec, 
s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB98_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw umax ptr %ptr, i32 %in seq_cst ret void @@ -4745,11 +4751,11 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB99_1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB99_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i32_noret_offset_scalar: @@ -4770,11 +4776,11 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB99_1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB99_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i32_noret_offset_scalar: @@ -4793,11 +4799,11 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB99_1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], 
s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB99_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw umax ptr %gep, i32 %in seq_cst @@ -4824,10 +4830,10 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB100_1 +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB100_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i32_ret_scalar: @@ -4849,10 +4855,10 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB100_1 +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB100_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i32_ret_scalar: @@ -4874,10 +4880,10 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB100_1 +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB100_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw umax ptr %ptr, i32 %in seq_cst ret i32 %result @@ -4903,10 +4909,10 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB101_1 +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB101_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i32_ret_offset_scalar: @@ -4928,10 +4934,10 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB101_1 +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB101_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i32_ret_offset_scalar: @@ -4953,10 +4959,10 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB101_1 +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB101_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, 
ptr %out, i32 4 %result = atomicrmw umax ptr %gep, i32 %in seq_cst @@ -4988,9 +4994,10 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB102_1 +; GCN1-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN1-NEXT: s_cbranch_scc1 .LBB102_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -5018,9 +5025,10 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB102_1 +; GCN2-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN2-NEXT: s_cbranch_scc1 .LBB102_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -5046,9 +5054,10 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB102_1 +; GCN3-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GCN3-NEXT: s_cbranch_scc1 .LBB102_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: @@ -5085,10 +5094,10 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB103_1 +; 
GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN1-NEXT: s_cbranch_scc1 .LBB103_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 @@ -5120,10 +5129,10 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB103_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN2-NEXT: s_cbranch_scc1 .LBB103_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 @@ -5153,10 +5162,10 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GCN3-NEXT: s_cbranch_execnz .LBB103_1 +; GCN3-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GCN3-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GCN3-NEXT: s_cbranch_scc1 .LBB103_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 @@ -5194,10 +5203,10 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB104_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; 
GCN1-NEXT: s_cbranch_scc1 .LBB104_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 @@ -5227,10 +5236,10 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB104_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN2-NEXT: s_cbranch_scc1 .LBB104_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 @@ -5260,10 +5269,10 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GCN3-NEXT: s_cbranch_execnz .LBB104_1 +; GCN3-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GCN3-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GCN3-NEXT: s_cbranch_scc1 .LBB104_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 @@ -5292,11 +5301,11 @@ define void @flat_umax_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 % ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v4, v3 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB105_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB105_1 ; GCN1-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_umax_i32_noret_offset__amdgpu_no_remote_memory: @@ -5315,11 +5324,11 @@ define void @flat_umax_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 % ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v4, v3 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB105_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB105_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_umax_i32_noret_offset__amdgpu_no_remote_memory: @@ -5336,11 +5345,11 @@ define void @flat_umax_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 % ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v4, v3 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB105_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB105_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i64 4 %tmp0 = atomicrmw umax ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -5365,10 +5374,10 @@ define i32 @flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB106_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB106_1 ; GCN1-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory: @@ -5388,10 +5397,10 @@ define i32 @flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB106_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB106_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory: @@ -5409,10 +5418,10 @@ define i32 @flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB106_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB106_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i64 4 @@ -5439,11 +5448,11 @@ define void @flat_atomic_umin_i32_noret(ptr %ptr, i32 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v4, v3 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB107_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB107_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] 
; ; GCN2-LABEL: flat_atomic_umin_i32_noret: @@ -5460,11 +5469,11 @@ define void @flat_atomic_umin_i32_noret(ptr %ptr, i32 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v4, v3 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB107_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB107_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i32_noret: @@ -5481,11 +5490,11 @@ define void @flat_atomic_umin_i32_noret(ptr %ptr, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v4, v3 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB107_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB107_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw umin ptr %ptr, i32 %in seq_cst ret void @@ -5508,11 +5517,11 @@ define void @flat_atomic_umin_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v4, v3 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB108_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB108_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i32_noret_offset: @@ -5531,11 +5540,11 @@ define void @flat_atomic_umin_i32_noret_offset(ptr %out, 
i32 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v4, v3 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB108_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB108_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i32_noret_offset: @@ -5552,11 +5561,11 @@ define void @flat_atomic_umin_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v4, v3 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB108_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB108_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw umin ptr %gep, i32 %in seq_cst @@ -5579,10 +5588,10 @@ define i32 @flat_atomic_umin_i32_ret(ptr %ptr, i32 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB109_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB109_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v3 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -5601,10 +5610,10 @@ define i32 @flat_atomic_umin_i32_ret(ptr %ptr, i32 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: 
s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB109_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB109_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v3 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -5623,10 +5632,10 @@ define i32 @flat_atomic_umin_i32_ret(ptr %ptr, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB109_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB109_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw umin ptr %ptr, i32 %in seq_cst @@ -5651,10 +5660,10 @@ define i32 @flat_atomic_umin_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB110_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB110_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i32_ret_offset: @@ -5674,10 +5683,10 @@ define i32 @flat_atomic_umin_i32_ret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB110_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB110_1 ; 
GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i32_ret_offset: @@ -5695,10 +5704,10 @@ define i32 @flat_atomic_umin_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB110_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB110_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 @@ -5723,11 +5732,11 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB111_1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB111_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i32_noret_scalar: @@ -5746,11 +5755,11 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB111_1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB111_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, 
exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i32_noret_scalar: @@ -5769,11 +5778,11 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB111_1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB111_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw umin ptr %ptr, i32 %in seq_cst ret void @@ -5798,11 +5807,11 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB112_1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB112_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i32_noret_offset_scalar: @@ -5823,11 +5832,11 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB112_1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB112_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, 
s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i32_noret_offset_scalar: @@ -5846,11 +5855,11 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB112_1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB112_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw umin ptr %gep, i32 %in seq_cst @@ -5877,10 +5886,10 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB113_1 +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB113_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i32_ret_scalar: @@ -5902,10 +5911,10 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB113_1 +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB113_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; 
GCN3-LABEL: flat_atomic_umin_i32_ret_scalar: @@ -5927,10 +5936,10 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB113_1 +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB113_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw umin ptr %ptr, i32 %in seq_cst ret i32 %result @@ -5956,10 +5965,10 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB114_1 +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB114_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i32_ret_offset_scalar: @@ -5981,10 +5990,10 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB114_1 +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB114_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i32_ret_offset_scalar: @@ -6006,10 +6015,10 @@ define amdgpu_gfx i32 
@flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB114_1 +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB114_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw umin ptr %gep, i32 %in seq_cst @@ -6033,11 +6042,11 @@ define void @flat_umin_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 % ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v4, v3 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB115_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB115_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_umin_i32_noret_offset__amdgpu_no_remote_memory: @@ -6056,11 +6065,11 @@ define void @flat_umin_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 % ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v4, v3 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB115_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB115_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_umin_i32_noret_offset__amdgpu_no_remote_memory: @@ -6077,11 +6086,11 @@ define void 
@flat_umin_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 % ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v4, v3 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB115_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB115_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i64 4 %tmp0 = atomicrmw umin ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -6106,10 +6115,10 @@ define i32 @flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB116_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB116_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory: @@ -6129,10 +6138,10 @@ define i32 @flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB116_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB116_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory: @@ -6150,10 +6159,10 @@ define i32 
@flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB116_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB116_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i64 4 @@ -6180,11 +6189,11 @@ define void @flat_atomic_min_i32_noret(ptr %ptr, i32 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v4, v3 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB117_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB117_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i32_noret: @@ -6201,11 +6210,11 @@ define void @flat_atomic_min_i32_noret(ptr %ptr, i32 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v4, v3 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB117_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB117_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i32_noret: @@ -6222,11 +6231,11 @@ define void @flat_atomic_min_i32_noret(ptr %ptr, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: 
s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v4, v3 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB117_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB117_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw min ptr %ptr, i32 %in seq_cst ret void @@ -6249,11 +6258,11 @@ define void @flat_atomic_min_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v4, v3 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB118_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB118_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i32_noret_offset: @@ -6272,11 +6281,11 @@ define void @flat_atomic_min_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v4, v3 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB118_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB118_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i32_noret_offset: @@ -6293,11 +6302,11 @@ define void @flat_atomic_min_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v4, v3 
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB118_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB118_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw min ptr %gep, i32 %in seq_cst @@ -6320,10 +6329,10 @@ define i32 @flat_atomic_min_i32_ret(ptr %ptr, i32 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB119_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB119_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v3 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -6342,10 +6351,10 @@ define i32 @flat_atomic_min_i32_ret(ptr %ptr, i32 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB119_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB119_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v3 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -6364,10 +6373,10 @@ define i32 @flat_atomic_min_i32_ret(ptr %ptr, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB119_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB119_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 
exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw min ptr %ptr, i32 %in seq_cst @@ -6392,10 +6401,10 @@ define i32 @flat_atomic_min_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB120_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB120_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i32_ret_offset: @@ -6415,10 +6424,10 @@ define i32 @flat_atomic_min_i32_ret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB120_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB120_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i32_ret_offset: @@ -6436,10 +6445,10 @@ define i32 @flat_atomic_min_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB120_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB120_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 @@ -6464,11 +6473,11 @@ define amdgpu_gfx void 
@flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB121_1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB121_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i32_noret_scalar: @@ -6487,11 +6496,11 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB121_1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB121_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i32_noret_scalar: @@ -6510,11 +6519,11 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB121_1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB121_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw min ptr %ptr, i32 %in seq_cst ret void @@ -6539,11 +6548,11 @@ define amdgpu_gfx void 
@flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB122_1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB122_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i32_noret_offset_scalar: @@ -6564,11 +6573,11 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB122_1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB122_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i32_noret_offset_scalar: @@ -6587,11 +6596,11 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB122_1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB122_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw min ptr %gep, i32 %in seq_cst @@ -6618,10 +6627,10 @@ 
define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB123_1 +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB123_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i32_ret_scalar: @@ -6643,10 +6652,10 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB123_1 +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB123_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i32_ret_scalar: @@ -6668,10 +6677,10 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB123_1 +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB123_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw min ptr %ptr, i32 %in seq_cst ret i32 %result @@ -6697,10 +6706,10 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN1-NEXT: buffer_wbinvl1_vol ; 
GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB124_1 +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB124_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i32_ret_offset_scalar: @@ -6722,10 +6731,10 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB124_1 +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB124_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i32_ret_offset_scalar: @@ -6747,10 +6756,10 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB124_1 +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB124_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw min ptr %gep, i32 %in seq_cst @@ -6782,9 +6791,10 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, 
s[0:1] +; GCN1-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB125_1 +; GCN1-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN1-NEXT: s_cbranch_scc1 .LBB125_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -6812,9 +6822,10 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB125_1 +; GCN2-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN2-NEXT: s_cbranch_scc1 .LBB125_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -6840,9 +6851,10 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB125_1 +; GCN3-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GCN3-NEXT: s_cbranch_scc1 .LBB125_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: @@ -6879,10 +6891,10 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB126_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN1-NEXT: s_cbranch_scc1 .LBB126_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; 
GCN1-NEXT: flat_store_dword v[0:1], v2 @@ -6914,10 +6926,10 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB126_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN2-NEXT: s_cbranch_scc1 .LBB126_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 @@ -6947,10 +6959,10 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GCN3-NEXT: s_cbranch_execnz .LBB126_1 +; GCN3-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GCN3-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GCN3-NEXT: s_cbranch_scc1 .LBB126_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 @@ -6982,9 +6994,10 @@ define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB127_1 +; GCN1-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN1-NEXT: s_cbranch_scc1 .LBB127_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -7006,9 +7019,10 @@ define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: 
s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB127_1 +; GCN2-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN2-NEXT: s_cbranch_scc1 .LBB127_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -7030,9 +7044,10 @@ define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB127_1 +; GCN3-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GCN3-NEXT: s_cbranch_scc1 .LBB127_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: @@ -7065,10 +7080,10 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB128_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN1-NEXT: s_cbranch_scc1 .LBB128_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 @@ -7098,10 +7113,10 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB128_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN2-NEXT: s_cbranch_scc1 .LBB128_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: 
s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 @@ -7131,10 +7146,10 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GCN3-NEXT: s_cbranch_execnz .LBB128_1 +; GCN3-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GCN3-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GCN3-NEXT: s_cbranch_scc1 .LBB128_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 @@ -7163,11 +7178,11 @@ define void @flat_min_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %i ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v4, v3 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB129_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB129_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_min_i32_noret_offset__amdgpu_no_remote_memory: @@ -7186,11 +7201,11 @@ define void @flat_min_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %i ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v4, v3 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB129_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB129_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: 
s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_min_i32_noret_offset__amdgpu_no_remote_memory: @@ -7207,11 +7222,11 @@ define void @flat_min_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %i ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v4, v3 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB129_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB129_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i64 4 %tmp0 = atomicrmw min ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -7236,10 +7251,10 @@ define i32 @flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i3 ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB130_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB130_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory: @@ -7259,10 +7274,10 @@ define i32 @flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i3 ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB130_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB130_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; 
GCN3-LABEL: flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory: @@ -7280,10 +7295,10 @@ define i32 @flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i3 ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB130_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB130_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i64 4 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll index 3fd624b592cd4d..f8b42d73ee29bc 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll @@ -1839,11 +1839,11 @@ define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: v_mov_b32_e32 v7, v5 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v4 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB50_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB50_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i64_noret: @@ -1867,11 +1867,11 @@ define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN2-NEXT: v_mov_b32_e32 v7, v5 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v4 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB50_1 +; GCN2-NEXT: 
s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB50_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i64_noret: @@ -1892,11 +1892,11 @@ define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB50_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB50_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw nand ptr %ptr, i64 %in seq_cst ret void @@ -1926,11 +1926,11 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN1-NEXT: v_mov_b32_e32 v7, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB51_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB51_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i64_noret_offset: @@ -1956,11 +1956,11 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN2-NEXT: v_mov_b32_e32 v7, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB51_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB51_1 ; GCN2-NEXT: ; %bb.2: 
; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i64_noret_offset: @@ -1981,11 +1981,11 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB51_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB51_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst @@ -2015,10 +2015,10 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB52_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB52_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v4 ; GCN1-NEXT: v_mov_b32_e32 v1, v5 ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -2045,10 +2045,10 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB52_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB52_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v4 ; GCN2-NEXT: v_mov_b32_e32 v1, 
v5 ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -2072,10 +2072,10 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB52_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB52_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -2108,10 +2108,10 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB53_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB53_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i64_ret_offset: @@ -2138,10 +2138,10 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB53_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB53_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i64_ret_offset: @@ -2163,10 +2163,10 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 
v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB53_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB53_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -2203,11 +2203,11 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB54_1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB54_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i64_noret_scalar: @@ -2237,11 +2237,11 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB54_1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB54_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i64_noret_scalar: @@ -2266,11 +2266,11 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 ; GCN3-NEXT: 
s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB54_1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB54_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw nand ptr %ptr, i64 %in seq_cst ret void @@ -2304,11 +2304,11 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB55_1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB55_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i64_noret_offset_scalar: @@ -2338,11 +2338,11 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB55_1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB55_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i64_noret_offset_scalar: @@ -2367,11 +2367,11 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 
v3, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB55_1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB55_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst @@ -2407,10 +2407,10 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB56_1 +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB56_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i64_ret_scalar: @@ -2441,10 +2441,10 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB56_1 +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB56_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i64_ret_scalar: @@ -2470,10 +2470,10 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN3-NEXT: s_or_b64 s[34:35], 
vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB56_1 +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB56_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw nand ptr %ptr, i64 %in seq_cst ret i64 %result @@ -2508,10 +2508,10 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB57_1 +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB57_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i64_ret_offset_scalar: @@ -2542,10 +2542,10 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB57_1 +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB57_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i64_ret_offset_scalar: @@ -2571,10 +2571,10 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: 
s_cbranch_execnz .LBB57_1 +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB57_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw nand ptr %gep, i64 %in seq_cst @@ -2605,11 +2605,11 @@ define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN1-NEXT: v_mov_b32_e32 v7, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB58_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB58_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory: @@ -2635,11 +2635,11 @@ define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN2-NEXT: v_mov_b32_e32 v7, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB58_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB58_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory: @@ -2660,11 +2660,11 @@ define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, 
s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB58_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB58_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -2696,10 +2696,10 @@ define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB59_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB59_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory: @@ -2726,10 +2726,10 @@ define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB59_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB59_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory: @@ -2751,10 +2751,10 @@ define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; 
GCN3-NEXT: s_cbranch_execnz .LBB59_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB59_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -3512,11 +3512,11 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: v_mov_b32_e32 v7, v5 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v4 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB80_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB80_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i64_noret: @@ -3539,11 +3539,11 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN2-NEXT: v_mov_b32_e32 v7, v5 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v4 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB80_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB80_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i64_noret: @@ -3563,11 +3563,11 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB80_1 +; GCN3-NEXT: s_cselect_b64 
exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB80_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst ret void @@ -3596,11 +3596,11 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN1-NEXT: v_mov_b32_e32 v7, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB81_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB81_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i64_noret_offset: @@ -3625,11 +3625,11 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN2-NEXT: v_mov_b32_e32 v7, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB81_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB81_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i64_noret_offset: @@ -3649,11 +3649,11 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB81_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB81_1 ; GCN3-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst @@ -3682,10 +3682,10 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB82_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB82_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v4 ; GCN1-NEXT: v_mov_b32_e32 v1, v5 ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -3711,10 +3711,10 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB82_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB82_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v4 ; GCN2-NEXT: v_mov_b32_e32 v1, v5 ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -3737,10 +3737,10 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB82_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB82_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 
s[30:31] @@ -3772,10 +3772,10 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB83_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB83_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i64_ret_offset: @@ -3801,10 +3801,10 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB83_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB83_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i64_ret_offset: @@ -3825,10 +3825,10 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB83_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB83_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -3866,11 +3866,11 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v3, v1 
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB84_1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB84_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i64_noret_scalar: @@ -3901,11 +3901,11 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB84_1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB84_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i64_noret_scalar: @@ -3931,11 +3931,11 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB84_1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB84_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst ret void @@ -3970,11 +3970,11 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v3, 
v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB85_1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB85_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i64_noret_offset_scalar: @@ -4005,11 +4005,11 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB85_1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB85_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i64_noret_offset_scalar: @@ -4035,11 +4035,11 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB85_1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB85_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst @@ -4076,10 +4076,10 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: buffer_wbinvl1_vol ; 
GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB86_1 +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB86_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i64_ret_scalar: @@ -4111,10 +4111,10 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB86_1 +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB86_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i64_ret_scalar: @@ -4141,10 +4141,10 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB86_1 +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB86_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw max ptr %ptr, i64 %in seq_cst ret i64 %result @@ -4180,10 +4180,10 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; 
GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB87_1 +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB87_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i64_ret_offset_scalar: @@ -4215,10 +4215,10 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB87_1 +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB87_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i64_ret_offset_scalar: @@ -4245,10 +4245,10 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB87_1 +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB87_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw max ptr %gep, i64 %in seq_cst @@ -4284,9 +4284,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GCN1-NEXT: 
v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB88_1 +; GCN1-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN1-NEXT: s_cbranch_scc1 .LBB88_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -4318,9 +4319,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB88_1 +; GCN2-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN2-NEXT: s_cbranch_scc1 .LBB88_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -4350,9 +4352,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB88_1 +; GCN3-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GCN3-NEXT: s_cbranch_scc1 .LBB88_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: @@ -4391,10 +4394,10 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB89_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN1-NEXT: s_cbranch_scc1 .LBB89_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ 
-4428,10 +4431,10 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB89_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN2-NEXT: s_cbranch_scc1 .LBB89_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -4463,10 +4466,10 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB89_1 +; GCN3-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GCN3-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GCN3-NEXT: s_cbranch_scc1 .LBB89_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -4506,9 +4509,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB90_1 +; GCN1-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN1-NEXT: s_cbranch_scc1 .LBB90_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -4538,9 +4542,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: 
v_mov_b32_e32 v3, v1 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB90_1 +; GCN2-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN2-NEXT: s_cbranch_scc1 .LBB90_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -4570,9 +4575,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB90_1 +; GCN3-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GCN3-NEXT: s_cbranch_scc1 .LBB90_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: @@ -4608,10 +4614,10 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB91_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN1-NEXT: s_cbranch_scc1 .LBB91_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -4643,10 +4649,10 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB91_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN2-NEXT: 
s_cbranch_scc1 .LBB91_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -4678,10 +4684,10 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB91_1 +; GCN3-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GCN3-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GCN3-NEXT: s_cbranch_scc1 .LBB91_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -4716,11 +4722,11 @@ define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN1-NEXT: v_mov_b32_e32 v7, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB92_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB92_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory: @@ -4745,11 +4751,11 @@ define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN2-NEXT: v_mov_b32_e32 v7, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB92_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: 
s_cbranch_scc1 .LBB92_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory: @@ -4769,11 +4775,11 @@ define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB92_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB92_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -4804,10 +4810,10 @@ define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB93_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB93_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory: @@ -4833,10 +4839,10 @@ define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB93_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: 
s_cbranch_scc1 .LBB93_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory: @@ -4857,10 +4863,10 @@ define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB93_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB93_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -4894,11 +4900,11 @@ define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: v_mov_b32_e32 v7, v5 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v4 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB94_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB94_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i64_noret: @@ -4921,11 +4927,11 @@ define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN2-NEXT: v_mov_b32_e32 v7, v5 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v4 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB94_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB94_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, 
exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i64_noret: @@ -4945,11 +4951,11 @@ define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB94_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB94_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst ret void @@ -4978,11 +4984,11 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN1-NEXT: v_mov_b32_e32 v7, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB95_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB95_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i64_noret_offset: @@ -5007,11 +5013,11 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN2-NEXT: v_mov_b32_e32 v7, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB95_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB95_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: 
flat_atomic_umax_i64_noret_offset: @@ -5031,11 +5037,11 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB95_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB95_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst @@ -5064,10 +5070,10 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB96_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB96_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v4 ; GCN1-NEXT: v_mov_b32_e32 v1, v5 ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -5093,10 +5099,10 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB96_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB96_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v4 ; GCN2-NEXT: v_mov_b32_e32 v1, v5 ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -5119,10 +5125,10 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, 
i64 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB96_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB96_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -5154,10 +5160,10 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB97_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB97_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i64_ret_offset: @@ -5183,10 +5189,10 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB97_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB97_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i64_ret_offset: @@ -5207,10 +5213,10 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; 
GCN3-NEXT: s_cbranch_execnz .LBB97_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB97_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -5248,11 +5254,11 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB98_1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB98_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i64_noret_scalar: @@ -5283,11 +5289,11 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB98_1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB98_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i64_noret_scalar: @@ -5313,11 +5319,11 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 
v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB98_1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB98_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst ret void @@ -5352,11 +5358,11 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB99_1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB99_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i64_noret_offset_scalar: @@ -5387,11 +5393,11 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB99_1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB99_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i64_noret_offset_scalar: @@ -5417,11 +5423,11 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; 
GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB99_1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB99_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst @@ -5458,10 +5464,10 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB100_1 +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB100_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i64_ret_scalar: @@ -5493,10 +5499,10 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB100_1 +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB100_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i64_ret_scalar: @@ -5523,10 +5529,10 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB100_1 
+; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB100_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw umax ptr %ptr, i64 %in seq_cst ret i64 %result @@ -5562,10 +5568,10 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB101_1 +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB101_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i64_ret_offset_scalar: @@ -5597,10 +5603,10 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB101_1 +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB101_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i64_ret_offset_scalar: @@ -5627,10 +5633,10 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB101_1 +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: 
s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB101_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw umax ptr %gep, i64 %in seq_cst @@ -5666,9 +5672,10 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB102_1 +; GCN1-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN1-NEXT: s_cbranch_scc1 .LBB102_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -5700,9 +5707,10 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB102_1 +; GCN2-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN2-NEXT: s_cbranch_scc1 .LBB102_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -5732,9 +5740,10 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB102_1 +; GCN3-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GCN3-NEXT: s_cbranch_scc1 .LBB102_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: @@ -5773,10 +5782,10 @@ define amdgpu_kernel void 
@atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB103_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN1-NEXT: s_cbranch_scc1 .LBB103_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -5810,10 +5819,10 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB103_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN2-NEXT: s_cbranch_scc1 .LBB103_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -5845,10 +5854,10 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB103_1 +; GCN3-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GCN3-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GCN3-NEXT: s_cbranch_scc1 .LBB103_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -5888,10 +5897,10 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; 
GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB104_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN1-NEXT: s_cbranch_scc1 .LBB104_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -5923,10 +5932,10 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB104_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN2-NEXT: s_cbranch_scc1 .LBB104_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -5958,10 +5967,10 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB104_1 +; GCN3-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GCN3-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GCN3-NEXT: s_cbranch_scc1 .LBB104_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -5996,11 +6005,11 @@ define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN1-NEXT: 
v_mov_b32_e32 v7, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB105_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB105_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory: @@ -6025,11 +6034,11 @@ define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN2-NEXT: v_mov_b32_e32 v7, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB105_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB105_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory: @@ -6049,11 +6058,11 @@ define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB105_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB105_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -6084,10 +6093,10 @@ define i64 
@flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB106_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB106_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory: @@ -6113,10 +6122,10 @@ define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB106_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB106_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory: @@ -6137,10 +6146,10 @@ define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB106_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB106_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -6174,11 +6183,11 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 
v[4:5], v[6:7] ; GCN1-NEXT: v_mov_b32_e32 v7, v5 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v4 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB107_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB107_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i64_noret: @@ -6201,11 +6210,11 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN2-NEXT: v_mov_b32_e32 v7, v5 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v4 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB107_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB107_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i64_noret: @@ -6225,11 +6234,11 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB107_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB107_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw umin ptr %ptr, i64 %in seq_cst ret void @@ -6258,11 +6267,11 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN1-NEXT: v_mov_b32_e32 v7, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; 
GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB108_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB108_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i64_noret_offset: @@ -6287,11 +6296,11 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN2-NEXT: v_mov_b32_e32 v7, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB108_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB108_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i64_noret_offset: @@ -6311,11 +6320,11 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB108_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB108_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst @@ -6344,10 +6353,10 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; 
GCN1-NEXT: s_cbranch_execnz .LBB109_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB109_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v4 ; GCN1-NEXT: v_mov_b32_e32 v1, v5 ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -6373,10 +6382,10 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB109_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB109_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v4 ; GCN2-NEXT: v_mov_b32_e32 v1, v5 ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -6399,10 +6408,10 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB109_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB109_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -6434,10 +6443,10 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB110_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 
.LBB110_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i64_ret_offset: @@ -6463,10 +6472,10 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB110_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB110_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i64_ret_offset: @@ -6487,10 +6496,10 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB110_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB110_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -6528,11 +6537,11 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB111_1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB111_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 
s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i64_noret_scalar: @@ -6563,11 +6572,11 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB111_1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB111_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i64_noret_scalar: @@ -6593,11 +6602,11 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB111_1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB111_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw umin ptr %ptr, i64 %in seq_cst ret void @@ -6632,11 +6641,11 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB112_1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB112_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: 
s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i64_noret_offset_scalar: @@ -6667,11 +6676,11 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB112_1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB112_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i64_noret_offset_scalar: @@ -6697,11 +6706,11 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB112_1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB112_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst @@ -6738,10 +6747,10 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB113_1 +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB113_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, 
s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i64_ret_scalar: @@ -6773,10 +6782,10 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB113_1 +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB113_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i64_ret_scalar: @@ -6803,10 +6812,10 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB113_1 +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB113_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw umin ptr %ptr, i64 %in seq_cst ret i64 %result @@ -6842,10 +6851,10 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB114_1 +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB114_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: 
flat_atomic_umin_i64_ret_offset_scalar: @@ -6877,10 +6886,10 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB114_1 +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB114_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i64_ret_offset_scalar: @@ -6907,10 +6916,10 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB114_1 +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB114_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw umin ptr %gep, i64 %in seq_cst @@ -6940,11 +6949,11 @@ define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN1-NEXT: v_mov_b32_e32 v7, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB115_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB115_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: 
flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory: @@ -6969,11 +6978,11 @@ define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN2-NEXT: v_mov_b32_e32 v7, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB115_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB115_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory: @@ -6993,11 +7002,11 @@ define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB115_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB115_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -7028,10 +7037,10 @@ define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB116_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB116_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; 
GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory: @@ -7057,10 +7066,10 @@ define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB116_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB116_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory: @@ -7081,10 +7090,10 @@ define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB116_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB116_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -7118,11 +7127,11 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: v_mov_b32_e32 v7, v5 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v4 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB117_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB117_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: 
flat_atomic_min_i64_noret: @@ -7145,11 +7154,11 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN2-NEXT: v_mov_b32_e32 v7, v5 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v4 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB117_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB117_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i64_noret: @@ -7169,11 +7178,11 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB117_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB117_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst ret void @@ -7202,11 +7211,11 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN1-NEXT: v_mov_b32_e32 v7, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB118_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB118_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i64_noret_offset: @@ -7231,11 +7240,11 @@ define void @flat_atomic_min_i64_noret_offset(ptr 
%out, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN2-NEXT: v_mov_b32_e32 v7, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB118_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB118_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i64_noret_offset: @@ -7255,11 +7264,11 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB118_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB118_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst @@ -7288,10 +7297,10 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB119_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB119_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v4 ; GCN1-NEXT: v_mov_b32_e32 v1, v5 ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -7317,10 +7326,10 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 
v[4:5], v[6:7] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB119_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB119_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v4 ; GCN2-NEXT: v_mov_b32_e32 v1, v5 ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -7343,10 +7352,10 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB119_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB119_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -7378,10 +7387,10 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB120_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB120_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i64_ret_offset: @@ -7407,10 +7416,10 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB120_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], 
exec, s[4:5] +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB120_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i64_ret_offset: @@ -7431,10 +7440,10 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB120_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB120_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -7472,11 +7481,11 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB121_1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB121_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i64_noret_scalar: @@ -7507,11 +7516,11 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB121_1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], 
s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB121_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i64_noret_scalar: @@ -7537,11 +7546,11 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB121_1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB121_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst ret void @@ -7576,11 +7585,11 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB122_1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB122_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i64_noret_offset_scalar: @@ -7611,11 +7620,11 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB122_1 +; GCN2-NEXT: s_cselect_b64 
exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB122_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i64_noret_offset_scalar: @@ -7641,11 +7650,11 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB122_1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB122_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst @@ -7682,10 +7691,10 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB123_1 +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB123_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i64_ret_scalar: @@ -7717,10 +7726,10 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB123_1 +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; 
GCN2-NEXT: s_cbranch_scc1 .LBB123_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i64_ret_scalar: @@ -7747,10 +7756,10 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB123_1 +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB123_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw min ptr %ptr, i64 %in seq_cst ret i64 %result @@ -7786,10 +7795,10 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB124_1 +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB124_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i64_ret_offset_scalar: @@ -7821,10 +7830,10 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB124_1 +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB124_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: 
s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i64_ret_offset_scalar: @@ -7851,10 +7860,10 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB124_1 +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB124_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw min ptr %gep, i64 %in seq_cst @@ -7890,9 +7899,10 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB125_1 +; GCN1-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN1-NEXT: s_cbranch_scc1 .LBB125_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -7924,9 +7934,10 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB125_1 +; GCN2-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN2-NEXT: s_cbranch_scc1 .LBB125_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -7956,9 +7967,10 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; 
GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB125_1 +; GCN3-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GCN3-NEXT: s_cbranch_scc1 .LBB125_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: @@ -7997,10 +8009,10 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB126_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN1-NEXT: s_cbranch_scc1 .LBB126_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -8034,10 +8046,10 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB126_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN2-NEXT: s_cbranch_scc1 .LBB126_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -8069,10 +8081,10 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, 
exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB126_1 +; GCN3-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GCN3-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GCN3-NEXT: s_cbranch_scc1 .LBB126_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -8110,9 +8122,10 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[0:1], exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB127_1 +; GCN1-NEXT: s_cselect_b64 exec, s[0:1], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB127_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -8140,9 +8153,10 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[0:1], exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB127_1 +; GCN2-NEXT: s_cselect_b64 exec, s[0:1], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB127_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -8170,9 +8184,10 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB127_1 +; GCN3-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GCN3-NEXT: s_cbranch_scc1 .LBB127_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: @@ -8207,10 
+8222,10 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB128_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN1-NEXT: s_cbranch_scc1 .LBB128_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -8242,10 +8257,10 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB128_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN2-NEXT: s_cbranch_scc1 .LBB128_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -8277,10 +8292,10 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB128_1 +; GCN3-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GCN3-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GCN3-NEXT: s_cbranch_scc1 .LBB128_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -8315,11 +8330,11 @@ define void 
@flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN1-NEXT: v_mov_b32_e32 v7, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB129_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB129_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory: @@ -8344,11 +8359,11 @@ define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN2-NEXT: v_mov_b32_e32 v7, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB129_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB129_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory: @@ -8368,11 +8383,11 @@ define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB129_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB129_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw min ptr %gep, i64 %in 
seq_cst, !amdgpu.no.remote.memory !0 @@ -8403,10 +8418,10 @@ define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB130_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB130_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory: @@ -8432,10 +8447,10 @@ define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB130_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB130_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory: @@ -8456,10 +8471,10 @@ define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB130_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB130_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] diff --git 
a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll index b32630a97b3ad0..f7ce48c55c5643 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll @@ -3997,10 +3997,12 @@ define void @v_fneg_copytoreg_f16(ptr addrspace(1) %out, half %a, half %b, half ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; SI-NEXT: v_mul_f32_e32 v2, v2, v3 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_cbranch_execz .LBB81_2 +; SI-NEXT: s_mov_b64 s[4:5], exec +; SI-NEXT: s_and_b64 s[6:7], vcc, -1 +; SI-NEXT: v_mul_f32_e32 v2, v2, v3 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB81_2 ; SI-NEXT: ; %bb.1: ; %if ; SI-NEXT: v_cvt_f16_f32_e64 v3, -v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -4010,8 +4012,8 @@ define void @v_fneg_copytoreg_f16(ptr addrspace(1) %out, half %a, half %b, half ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: flat_store_short v[0:1], v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: .LBB81_2: ; %endif ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: .LBB81_2: ; %endif ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: flat_store_short v[0:1], v2 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -4024,16 +4026,18 @@ define void @v_fneg_copytoreg_f16(ptr addrspace(1) %out, half %a, half %b, half ; VI-NEXT: v_lshlrev_b32_e32 v6, 1, v6 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mul_f16_e32 v2, v2, v3 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_cbranch_execz .LBB81_2 +; VI-NEXT: s_mov_b64 s[4:5], exec +; VI-NEXT: v_mul_f16_e32 v2, v2, v3 +; VI-NEXT: s_cmp_lg_u64 vcc, 0 +; VI-NEXT: s_cmov_b64 exec, vcc +; VI-NEXT: s_cbranch_scc0 .LBB81_2 ; VI-NEXT: ; %bb.1: ; %if ; VI-NEXT: v_mul_f16_e64 v3, -v2, v4 ; VI-NEXT: flat_store_short v[0:1], v3 ; VI-NEXT: s_waitcnt 
vmcnt(0) -; VI-NEXT: .LBB81_2: ; %endif ; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: .LBB81_2: ; %endif ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -4042,20 +4046,23 @@ define void @v_fneg_copytoreg_f16(ptr addrspace(1) %out, half %a, half %b, half ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v6, 0x3ff, v31 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v5 ; GFX11-NEXT: v_mul_f16_e32 v2, v2, v3 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_mov_b32 s1, exec_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 1, v6 -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v6 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v5 -; GFX11-NEXT: s_cbranch_execz .LBB81_2 +; GFX11-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX11-NEXT: v_add_co_u32 v0, s0, v0, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, v1, s0 +; GFX11-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB81_2 ; GFX11-NEXT: ; %bb.1: ; %if ; GFX11-NEXT: v_mul_f16_e64 v3, -v2, v4 ; GFX11-NEXT: global_store_b16 v[0:1], v3, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: .LBB81_2: ; %endif -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll index 74e2b9ea714258..21b647f31f35f3 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll @@ -2380,16 +2380,18 @@ define void @v_fneg_copytoreg_f32(ptr addrspace(1) %out, 
float %a, float %b, flo ; SI-NEXT: v_lshlrev_b32_e32 v6, 2, v6 ; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; SI-NEXT: v_mul_f32_e32 v2, v2, v3 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_cbranch_execz .LBB118_2 +; SI-NEXT: s_mov_b64 s[4:5], exec +; SI-NEXT: s_and_b64 s[6:7], vcc, -1 +; SI-NEXT: v_mul_f32_e32 v2, v2, v3 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB118_2 ; SI-NEXT: ; %bb.1: ; %if ; SI-NEXT: v_mul_f32_e64 v3, -v2, v4 ; SI-NEXT: flat_store_dword v[0:1], v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: .LBB118_2: ; %endif ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: .LBB118_2: ; %endif ; SI-NEXT: flat_store_dword v[0:1], v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -2401,16 +2403,18 @@ define void @v_fneg_copytoreg_f32(ptr addrspace(1) %out, float %a, float %b, flo ; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v6 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mul_f32_e32 v2, v2, v3 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_cbranch_execz .LBB118_2 +; VI-NEXT: s_mov_b64 s[4:5], exec +; VI-NEXT: v_mul_f32_e32 v2, v2, v3 +; VI-NEXT: s_cmp_lg_u64 vcc, 0 +; VI-NEXT: s_cmov_b64 exec, vcc +; VI-NEXT: s_cbranch_scc0 .LBB118_2 ; VI-NEXT: ; %bb.1: ; %if ; VI-NEXT: v_mul_f32_e64 v3, -v2, v4 ; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: .LBB118_2: ; %endif ; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: .LBB118_2: ; %endif ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/fold-fabs.ll b/llvm/test/CodeGen/AMDGPU/fold-fabs.ll index a04bf445493253..9ed26646c4f2cc 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fold-fabs.ll @@ -9,11 +9,14 @@ define float 
@fold_abs_in_branch(float %arg1, float %arg2) { ; GFX10-NEXT: s_mov_b32 s4, exec_lo ; GFX10-NEXT: v_add_f32_e32 v1, v0, v1 ; GFX10-NEXT: v_add_f32_e64 v0, |v1|, |v1| -; GFX10-NEXT: v_cmpx_nlt_f32_e32 1.0, v0 +; GFX10-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 1.0, v0 +; GFX10-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX10-NEXT: ; %bb.1: ; %if ; GFX10-NEXT: v_mul_f32_e64 v0, 0x3e4ccccd, |v1| -; GFX10-NEXT: ; %bb.2: ; %exit ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: .LBB0_2: ; %exit ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: %0 = fadd reassoc nnan nsz arcp contract afn float %arg1, %arg2 @@ -40,11 +43,14 @@ define float @fold_abs_in_branch_multiple_users(float %arg1, float %arg2) { ; GFX10-NEXT: s_mov_b32 s4, exec_lo ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_add_f32_e64 v1, |v0|, |v0| -; GFX10-NEXT: v_cmpx_nlt_f32_e32 1.0, v1 +; GFX10-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 1.0, v1 +; GFX10-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX10-NEXT: ; %bb.1: ; %if ; GFX10-NEXT: v_mul_f32_e64 v1, 0x3e4ccccd, |v0| -; GFX10-NEXT: ; %bb.2: ; %exit ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: .LBB1_2: ; %exit ; GFX10-NEXT: v_add_f32_e64 v0, |v0|, 2.0 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -125,11 +131,14 @@ define float @fold_abs_in_branch_fabs(float %arg1, float %arg2) { ; GFX10-NEXT: s_mov_b32 s4, exec_lo ; GFX10-NEXT: v_add_f32_e32 v1, v0, v1 ; GFX10-NEXT: v_add_f32_e64 v0, |v1|, |v1| -; GFX10-NEXT: v_cmpx_nlt_f32_e32 1.0, v0 +; GFX10-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 1.0, v0 +; GFX10-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX10-NEXT: ; %bb.1: ; %if ; GFX10-NEXT: v_mul_f32_e64 v0, 0x3e4ccccd, |v1| -; GFX10-NEXT: ; %bb.2: ; %exit ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: .LBB4_2: ; %exit ; 
GFX10-NEXT: s_setpc_b64 s[30:31] entry: %0 = fadd reassoc nnan nsz arcp contract afn float %arg1, %arg2 @@ -157,8 +166,10 @@ define float @fold_abs_in_branch_phi(float %arg1, float %arg2) { ; GFX10-NEXT: s_mov_b32 s4, exec_lo ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_add_f32_e64 v0, |v0|, |v0| -; GFX10-NEXT: v_cmpx_nlt_f32_e32 1.0, v0 -; GFX10-NEXT: s_cbranch_execz .LBB5_3 +; GFX10-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 1.0, v0 +; GFX10-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB5_4 ; GFX10-NEXT: ; %bb.1: ; %header.preheader ; GFX10-NEXT: ; implicit-def: $vgpr0 ; GFX10-NEXT: .LBB5_2: ; %header @@ -167,8 +178,9 @@ define float @fold_abs_in_branch_phi(float %arg1, float %arg2) { ; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, -1.0, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 ; GFX10-NEXT: s_cbranch_vccnz .LBB5_2 -; GFX10-NEXT: .LBB5_3: ; %Flow1 +; GFX10-NEXT: ; %bb.3: ; %Flow ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: .LBB5_4: ; %exit ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: %0 = fadd reassoc nnan nsz arcp contract afn float %arg1, %arg2 @@ -201,13 +213,16 @@ define float @fold_neg_in_branch(float %arg1, float %arg2) { ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX10-NEXT: s_mov_b32 s4, exec_lo ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 1.0, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_cmpx_nlt_f32_e32 1.0, v0 +; GFX10-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX10-NEXT: ; %bb.1: ; %if ; GFX10-NEXT: v_rcp_f32_e64 v1, -v0 ; GFX10-NEXT: v_mul_f32_e64 v1, |v0|, v1 -; GFX10-NEXT: ; %bb.2: ; %exit ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: .LBB6_2: ; %exit ; GFX10-NEXT: v_mul_f32_e64 v0, -v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll index 
957c10ddf85e5d..dd83716fc4cae5 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -1321,9 +1321,10 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX90A-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_endpgm ; @@ -1533,9 +1534,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX90A-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX90A-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll index 667a3f398c08a2..372838233b2c47 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll @@ -10,14 +10,16 @@ define i128 @fptosi_f64_to_i128(double %x) { ; SDAG-NEXT: v_bfe_u32 v6, v5, 20, 11 ; SDAG-NEXT: v_mov_b32_e32 v7, 0 ; SDAG-NEXT: s_mov_b64 s[4:5], 0x3fe -; SDAG-NEXT: v_mov_b32_e32 v4, v0 ; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[6:7] +; SDAG-NEXT: v_mov_b32_e32 v4, v0 ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: s_mov_b64 s[8:9], exec ; SDAG-NEXT: v_mov_b32_e32 v1, 0 ; 
SDAG-NEXT: v_mov_b32_e32 v3, 0 -; SDAG-NEXT: s_and_saveexec_b64 s[8:9], vcc -; SDAG-NEXT: s_cbranch_execz .LBB0_10 +; SDAG-NEXT: s_cmp_lg_u64 vcc, 0 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB0_10 ; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-end ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6 ; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v7, vcc @@ -29,26 +31,29 @@ define i128 @fptosi_f64_to_i128(double %x) { ; SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1] ; SDAG-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[4:5] ; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], exec +; SDAG-NEXT: s_xor_b64 s[10:11], s[4:5], exec +; SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[6:7] -; SDAG-NEXT: s_cbranch_execz .LBB0_7 +; SDAG-NEXT: s_cmov_b64 exec, s[4:5] +; SDAG-NEXT: s_cbranch_scc0 .LBB0_7 ; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9 ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0 ; SDAG-NEXT: v_addc_co_u32_e64 v10, s[4:5], 0, -1, s[4:5] ; SDAG-NEXT: s_mov_b64 s[4:5], 0x432 -; SDAG-NEXT: v_and_b32_e32 v0, 0xfffff, v5 ; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[6:7] +; SDAG-NEXT: v_and_b32_e32 v0, 0xfffff, v5 +; SDAG-NEXT: s_xor_b64 s[12:13], s[4:5], exec ; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v11, -1, 1, vcc ; SDAG-NEXT: v_or_b32_e32 v5, 0x100000, v0 +; SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7] -; SDAG-NEXT: s_cbranch_execz .LBB0_4 +; SDAG-NEXT: s_cmov_b64 exec, s[4:5] +; SDAG-NEXT: s_cbranch_scc0 .LBB0_4 ; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else ; SDAG-NEXT: v_sub_u32_e32 v0, 0x473, v6 ; SDAG-NEXT: v_add_u32_e32 v2, 
0xfffffb8d, v6 @@ -90,9 +95,12 @@ define i128 @fptosi_f64_to_i128(double %x) { ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; SDAG-NEXT: ; implicit-def: $vgpr9 ; SDAG-NEXT: ; implicit-def: $vgpr10 +; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] ; SDAG-NEXT: .LBB0_4: ; %Flow -; SDAG-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] -; SDAG-NEXT: s_cbranch_execz .LBB0_6 +; SDAG-NEXT: s_xor_b64 s[14:15], s[12:13], exec +; SDAG-NEXT: s_cmp_lg_u64 s[12:13], 0 +; SDAG-NEXT: s_cmov_b64 exec, s[12:13] +; SDAG-NEXT: s_cbranch_scc0 .LBB0_6 ; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; SDAG-NEXT: v_sub_u32_e32 v2, 0x433, v6 ; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[4:5] @@ -114,10 +122,14 @@ define i128 @fptosi_f64_to_i128(double %x) { ; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, v[2:3] ; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v6, v[3:4] ; SDAG-NEXT: v_mad_i32_i24 v3, v9, v5, v3 +; SDAG-NEXT: s_or_b64 exec, exec, s[14:15] ; SDAG-NEXT: .LBB0_6: ; %Flow1 -; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB0_7: ; %Flow2 -; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: s_xor_b64 s[4:5], s[10:11], exec +; SDAG-NEXT: s_cmp_lg_u64 s[10:11], 0 +; SDAG-NEXT: s_cmov_b64 exec, s[10:11] +; SDAG-NEXT: s_cbranch_scc0 .LBB0_9 ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 @@ -125,10 +137,10 @@ define i128 @fptosi_f64_to_i128(double %x) { ; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; SDAG-NEXT: v_mov_b32_e32 v0, v2 ; SDAG-NEXT: v_mov_b32_e32 v1, v2 -; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; SDAG-NEXT: .LBB0_10: ; %fp-to-i-cleanup +; SDAG-NEXT: .LBB0_9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: .LBB0_10: ; %fp-to-i-cleanup ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: fptosi_f64_to_i128: @@ -139,17 +151,19 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: v_lshrrev_b32_e32 v0, 20, v5 ; 
GISEL-NEXT: v_and_b32_e32 v6, 0x7ff, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x3ff -; GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-NEXT: v_mov_b32_e32 v7, 0 +; GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1] ; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] ; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: s_mov_b64 s[12:13], exec ; GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GISEL-NEXT: v_mov_b32_e32 v2, s6 ; GISEL-NEXT: v_mov_b32_e32 v3, s7 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc -; GISEL-NEXT: s_cbranch_execz .LBB0_10 +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB0_10 ; GISEL-NEXT: ; %bb.1: ; %fp-to-i-if-end ; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6 ; GISEL-NEXT: v_mov_b32_e32 v2, 0xffffff80 @@ -166,10 +180,11 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GISEL-NEXT: s_xor_b64 s[14:15], vcc, exec +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 ; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_xor_b64 s[14:15], exec, s[6:7] -; GISEL-NEXT: s_cbranch_execz .LBB0_7 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB0_7 ; GISEL-NEXT: ; %bb.2: ; %fp-to-i-if-end9 ; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7] @@ -228,13 +243,14 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: v_or3_b32 v9, v0, v2, 0 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x433 ; GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GISEL-NEXT: v_and_b32_e32 v2, 0xfffff, v5 ; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1] +; GISEL-NEXT: v_and_b32_e32 v2, 0xfffff, v5 +; GISEL-NEXT: s_xor_b64 s[16:17], vcc, exec ; GISEL-NEXT: v_or_b32_e32 v5, 0x100000, v2 +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 ; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; 
GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] -; GISEL-NEXT: s_cbranch_execz .LBB0_4 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB0_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else ; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffbcd, v6 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5] @@ -268,9 +284,12 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr8 +; GISEL-NEXT: s_or_b64 exec, exec, s[16:17] ; GISEL-NEXT: .LBB0_4: ; %Flow -; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[16:17] -; GISEL-NEXT: s_cbranch_execz .LBB0_6 +; GISEL-NEXT: s_xor_b64 s[8:9], s[16:17], exec +; GISEL-NEXT: s_cmp_lg_u64 s[16:17], 0 +; GISEL-NEXT: s_cmov_b64 exec, s[16:17] +; GISEL-NEXT: s_cbranch_scc0 .LBB0_6 ; GISEL-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; GISEL-NEXT: v_sub_co_u32_e32 v6, vcc, 0x433, v6 ; GISEL-NEXT: v_subrev_u32_e32 v2, 64, v6 @@ -291,11 +310,14 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v5, v8, v[1:2] ; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v3, v4, s[6:7] ; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc -; GISEL-NEXT: .LBB0_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: .LBB0_6: ; %Flow1 +; GISEL-NEXT: s_or_b64 exec, exec, s[14:15] ; GISEL-NEXT: .LBB0_7: ; %Flow2 -; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[14:15] -; GISEL-NEXT: s_cbranch_execz .LBB0_9 +; GISEL-NEXT: s_xor_b64 s[6:7], s[14:15], exec +; GISEL-NEXT: s_cmp_lg_u64 s[14:15], 0 +; GISEL-NEXT: s_cmov_b64 exec, s[14:15] +; GISEL-NEXT: s_cbranch_scc0 .LBB0_9 ; GISEL-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] ; GISEL-NEXT: v_and_b32_e32 v1, 1, v1 @@ -365,10 +387,10 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1 ; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1 ; GISEL-NEXT: v_mov_b32_e32 v2, v1 -; 
GISEL-NEXT: .LBB0_9: ; %Flow3 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] -; GISEL-NEXT: .LBB0_10: ; %fp-to-i-cleanup +; GISEL-NEXT: .LBB0_9: ; %Flow3 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB0_10: ; %fp-to-i-cleanup ; GISEL-NEXT: s_setpc_b64 s[30:31] %cvt = fptosi double %x to i128 ret i128 %cvt @@ -382,14 +404,16 @@ define i128 @fptoui_f64_to_i128(double %x) { ; SDAG-NEXT: v_bfe_u32 v6, v5, 20, 11 ; SDAG-NEXT: v_mov_b32_e32 v7, 0 ; SDAG-NEXT: s_mov_b64 s[4:5], 0x3fe -; SDAG-NEXT: v_mov_b32_e32 v4, v0 ; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[6:7] +; SDAG-NEXT: v_mov_b32_e32 v4, v0 ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: s_mov_b64 s[8:9], exec ; SDAG-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-NEXT: v_mov_b32_e32 v3, 0 -; SDAG-NEXT: s_and_saveexec_b64 s[8:9], vcc -; SDAG-NEXT: s_cbranch_execz .LBB1_10 +; SDAG-NEXT: s_cmp_lg_u64 vcc, 0 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB1_10 ; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-end ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6 ; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v7, vcc @@ -401,26 +425,29 @@ define i128 @fptoui_f64_to_i128(double %x) { ; SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1] ; SDAG-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[4:5] ; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], exec +; SDAG-NEXT: s_xor_b64 s[10:11], s[4:5], exec +; SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[6:7] -; SDAG-NEXT: s_cbranch_execz .LBB1_7 +; SDAG-NEXT: s_cmov_b64 exec, s[4:5] +; SDAG-NEXT: s_cbranch_scc0 .LBB1_7 ; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9 ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0 ; SDAG-NEXT: v_addc_co_u32_e64 v10, s[4:5], 0, -1, s[4:5] ; SDAG-NEXT: s_mov_b64 s[4:5], 0x432 -; SDAG-NEXT: 
v_and_b32_e32 v0, 0xfffff, v5 ; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[6:7] +; SDAG-NEXT: v_and_b32_e32 v0, 0xfffff, v5 +; SDAG-NEXT: s_xor_b64 s[12:13], s[4:5], exec ; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v11, -1, 1, vcc ; SDAG-NEXT: v_or_b32_e32 v5, 0x100000, v0 +; SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7] -; SDAG-NEXT: s_cbranch_execz .LBB1_4 +; SDAG-NEXT: s_cmov_b64 exec, s[4:5] +; SDAG-NEXT: s_cbranch_scc0 .LBB1_4 ; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else ; SDAG-NEXT: v_sub_u32_e32 v0, 0x473, v6 ; SDAG-NEXT: v_add_u32_e32 v2, 0xfffffb8d, v6 @@ -462,9 +489,12 @@ define i128 @fptoui_f64_to_i128(double %x) { ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; SDAG-NEXT: ; implicit-def: $vgpr9 ; SDAG-NEXT: ; implicit-def: $vgpr10 +; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] ; SDAG-NEXT: .LBB1_4: ; %Flow -; SDAG-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] -; SDAG-NEXT: s_cbranch_execz .LBB1_6 +; SDAG-NEXT: s_xor_b64 s[14:15], s[12:13], exec +; SDAG-NEXT: s_cmp_lg_u64 s[12:13], 0 +; SDAG-NEXT: s_cmov_b64 exec, s[12:13] +; SDAG-NEXT: s_cbranch_scc0 .LBB1_6 ; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; SDAG-NEXT: v_sub_u32_e32 v2, 0x433, v6 ; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[4:5] @@ -486,10 +516,14 @@ define i128 @fptoui_f64_to_i128(double %x) { ; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, v[2:3] ; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v6, v[3:4] ; SDAG-NEXT: v_mad_i32_i24 v3, v9, v5, v3 +; SDAG-NEXT: s_or_b64 exec, exec, s[14:15] ; SDAG-NEXT: .LBB1_6: ; %Flow1 -; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB1_7: ; %Flow2 -; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: s_xor_b64 s[4:5], s[10:11], exec +; SDAG-NEXT: s_cmp_lg_u64 s[10:11], 0 +; SDAG-NEXT: s_cmov_b64 exec, 
s[10:11] +; SDAG-NEXT: s_cbranch_scc0 .LBB1_9 ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 @@ -497,10 +531,10 @@ define i128 @fptoui_f64_to_i128(double %x) { ; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; SDAG-NEXT: v_mov_b32_e32 v0, v2 ; SDAG-NEXT: v_mov_b32_e32 v1, v2 -; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; SDAG-NEXT: .LBB1_10: ; %fp-to-i-cleanup +; SDAG-NEXT: .LBB1_9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: .LBB1_10: ; %fp-to-i-cleanup ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: fptoui_f64_to_i128: @@ -511,17 +545,19 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: v_lshrrev_b32_e32 v0, 20, v5 ; GISEL-NEXT: v_and_b32_e32 v6, 0x7ff, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x3ff -; GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-NEXT: v_mov_b32_e32 v7, 0 +; GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1] ; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] ; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: s_mov_b64 s[12:13], exec ; GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GISEL-NEXT: v_mov_b32_e32 v2, s6 ; GISEL-NEXT: v_mov_b32_e32 v3, s7 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc -; GISEL-NEXT: s_cbranch_execz .LBB1_10 +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB1_10 ; GISEL-NEXT: ; %bb.1: ; %fp-to-i-if-end ; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6 ; GISEL-NEXT: v_mov_b32_e32 v2, 0xffffff80 @@ -538,10 +574,11 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GISEL-NEXT: s_xor_b64 s[14:15], vcc, exec +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 ; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_xor_b64 s[14:15], exec, 
s[6:7] -; GISEL-NEXT: s_cbranch_execz .LBB1_7 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB1_7 ; GISEL-NEXT: ; %bb.2: ; %fp-to-i-if-end9 ; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7] @@ -600,13 +637,14 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: v_or3_b32 v9, v0, v2, 0 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x433 ; GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GISEL-NEXT: v_and_b32_e32 v2, 0xfffff, v5 ; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1] +; GISEL-NEXT: v_and_b32_e32 v2, 0xfffff, v5 +; GISEL-NEXT: s_xor_b64 s[16:17], vcc, exec ; GISEL-NEXT: v_or_b32_e32 v5, 0x100000, v2 +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 ; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] -; GISEL-NEXT: s_cbranch_execz .LBB1_4 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB1_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else ; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffbcd, v6 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5] @@ -640,9 +678,12 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr8 +; GISEL-NEXT: s_or_b64 exec, exec, s[16:17] ; GISEL-NEXT: .LBB1_4: ; %Flow -; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[16:17] -; GISEL-NEXT: s_cbranch_execz .LBB1_6 +; GISEL-NEXT: s_xor_b64 s[8:9], s[16:17], exec +; GISEL-NEXT: s_cmp_lg_u64 s[16:17], 0 +; GISEL-NEXT: s_cmov_b64 exec, s[16:17] +; GISEL-NEXT: s_cbranch_scc0 .LBB1_6 ; GISEL-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; GISEL-NEXT: v_sub_co_u32_e32 v6, vcc, 0x433, v6 ; GISEL-NEXT: v_subrev_u32_e32 v2, 64, v6 @@ -663,11 +704,14 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v5, v8, v[1:2] ; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v3, v4, s[6:7] ; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc -; 
GISEL-NEXT: .LBB1_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: .LBB1_6: ; %Flow1 +; GISEL-NEXT: s_or_b64 exec, exec, s[14:15] ; GISEL-NEXT: .LBB1_7: ; %Flow2 -; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[14:15] -; GISEL-NEXT: s_cbranch_execz .LBB1_9 +; GISEL-NEXT: s_xor_b64 s[6:7], s[14:15], exec +; GISEL-NEXT: s_cmp_lg_u64 s[14:15], 0 +; GISEL-NEXT: s_cmov_b64 exec, s[14:15] +; GISEL-NEXT: s_cbranch_scc0 .LBB1_9 ; GISEL-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] ; GISEL-NEXT: v_and_b32_e32 v1, 1, v1 @@ -737,10 +781,10 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1 ; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1 ; GISEL-NEXT: v_mov_b32_e32 v2, v1 -; GISEL-NEXT: .LBB1_9: ; %Flow3 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] -; GISEL-NEXT: .LBB1_10: ; %fp-to-i-cleanup +; GISEL-NEXT: .LBB1_9: ; %Flow3 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB1_10: ; %fp-to-i-cleanup ; GISEL-NEXT: s_setpc_b64 s[30:31] %cvt = fptoui double %x to i128 ret i128 %cvt @@ -753,14 +797,16 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG-NEXT: v_mov_b32_e32 v4, v0 ; SDAG-NEXT: v_bfe_u32 v5, v4, 23, 8 ; SDAG-NEXT: s_movk_i32 s4, 0x7e +; SDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v5 ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v6, 0 +; SDAG-NEXT: s_mov_b64 s[8:9], exec ; SDAG-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-NEXT: v_mov_b32_e32 v3, 0 -; SDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v5 -; SDAG-NEXT: s_and_saveexec_b64 s[8:9], vcc -; SDAG-NEXT: s_cbranch_execz .LBB2_10 +; SDAG-NEXT: v_mov_b32_e32 v6, 0 +; SDAG-NEXT: s_cmp_lg_u64 vcc, 0 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB2_10 ; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-end ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5 ; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc @@ -772,27 +818,30 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG-NEXT: 
v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1] ; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, -1, v4 ; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], exec +; SDAG-NEXT: s_xor_b64 s[10:11], s[4:5], exec +; SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[6:7] -; SDAG-NEXT: s_cbranch_execz .LBB2_7 +; SDAG-NEXT: s_cmov_b64 exec, s[4:5] +; SDAG-NEXT: s_cbranch_scc0 .LBB2_7 ; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9 ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0 ; SDAG-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, -1, s[4:5] ; SDAG-NEXT: s_mov_b64 s[4:5], 0x95 -; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v4 ; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6] +; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v4 +; SDAG-NEXT: s_xor_b64 s[12:13], s[4:5], exec ; SDAG-NEXT: v_mov_b32_e32 v7, 0 ; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v10, -1, 1, vcc ; SDAG-NEXT: v_or_b32_e32 v6, 0x800000, v0 +; SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7] -; SDAG-NEXT: s_cbranch_execz .LBB2_4 +; SDAG-NEXT: s_cmov_b64 exec, s[4:5] +; SDAG-NEXT: s_cbranch_scc0 .LBB2_4 ; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else ; SDAG-NEXT: v_sub_u32_e32 v0, 0xd6, v5 ; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff2a, v5 @@ -830,12 +879,15 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 ; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 ; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] -; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 +; SDAG-NEXT: s_or_b64 exec, 
exec, s[12:13] ; SDAG-NEXT: .LBB2_4: ; %Flow -; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[12:13] -; SDAG-NEXT: s_cbranch_execz .LBB2_6 +; SDAG-NEXT: s_xor_b64 s[6:7], s[12:13], exec +; SDAG-NEXT: s_cmp_lg_u64 s[12:13], 0 +; SDAG-NEXT: s_cmov_b64 exec, s[12:13] +; SDAG-NEXT: s_cbranch_scc0 .LBB2_6 ; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; SDAG-NEXT: v_sub_u32_e32 v2, 0x96, v5 ; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[6:7] @@ -849,10 +901,14 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG-NEXT: v_mov_b32_e32 v1, v5 ; SDAG-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v9, v3, v[1:2] ; SDAG-NEXT: v_mov_b32_e32 v1, v4 -; SDAG-NEXT: .LBB2_6: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: .LBB2_6: ; %Flow1 +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB2_7: ; %Flow2 -; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: s_xor_b64 s[4:5], s[10:11], exec +; SDAG-NEXT: s_cmp_lg_u64 s[10:11], 0 +; SDAG-NEXT: s_cmov_b64 exec, s[10:11] +; SDAG-NEXT: s_cbranch_scc0 .LBB2_9 ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 @@ -860,10 +916,10 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; SDAG-NEXT: v_mov_b32_e32 v0, v2 ; SDAG-NEXT: v_mov_b32_e32 v1, v2 -; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; SDAG-NEXT: .LBB2_10: ; %fp-to-i-cleanup +; SDAG-NEXT: .LBB2_9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: .LBB2_10: ; %fp-to-i-cleanup ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: fptosi_f32_to_i128: @@ -872,19 +928,21 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: v_mov_b32_e32 v4, v0 ; GISEL-NEXT: v_mov_b32_e32 v5, 0 ; GISEL-NEXT: v_lshrrev_b64 v[0:1], 23, v[4:5] -; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: v_mov_b32_e32 v7, v5 ; GISEL-NEXT: v_bfe_u32 v6, v0, 0, 8 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x7f ; GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GISEL-NEXT: 
v_mov_b32_e32 v7, v5 +; GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1] ; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] ; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: s_mov_b64 s[12:13], exec ; GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GISEL-NEXT: v_mov_b32_e32 v2, s6 ; GISEL-NEXT: v_mov_b32_e32 v3, s7 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc -; GISEL-NEXT: s_cbranch_execz .LBB2_10 +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB2_10 ; GISEL-NEXT: ; %bb.1: ; %fp-to-i-if-end ; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v6 ; GISEL-NEXT: v_mov_b32_e32 v2, 0xffffff80 @@ -901,10 +959,11 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GISEL-NEXT: s_xor_b64 s[14:15], vcc, exec +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 ; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_xor_b64 s[14:15], exec, s[6:7] -; GISEL-NEXT: s_cbranch_execz .LBB2_7 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB2_7 ; GISEL-NEXT: ; %bb.2: ; %fp-to-i-if-end9 ; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7] @@ -963,14 +1022,15 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: v_or3_b32 v8, v0, v2, 0 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x96 ; GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GISEL-NEXT: v_and_b32_e32 v2, 0x7fffff, v4 ; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1] +; GISEL-NEXT: v_and_b32_e32 v2, 0x7fffff, v4 +; GISEL-NEXT: s_xor_b64 s[16:17], vcc, exec ; GISEL-NEXT: v_or_b32_e32 v4, 0x800000, v2 ; GISEL-NEXT: v_mov_b32_e32 v5, 0 +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 ; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] -; GISEL-NEXT: s_cbranch_execz .LBB2_4 
+; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB2_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else ; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff6a, v6 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5] @@ -1004,9 +1064,12 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr9 +; GISEL-NEXT: s_or_b64 exec, exec, s[16:17] ; GISEL-NEXT: .LBB2_4: ; %Flow -; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[16:17] -; GISEL-NEXT: s_cbranch_execz .LBB2_6 +; GISEL-NEXT: s_xor_b64 s[6:7], s[16:17], exec +; GISEL-NEXT: s_cmp_lg_u64 s[16:17], 0 +; GISEL-NEXT: s_cmov_b64 exec, s[16:17] +; GISEL-NEXT: s_cbranch_scc0 .LBB2_6 ; GISEL-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; GISEL-NEXT: v_sub_co_u32_e32 v3, vcc, 0x96, v6 ; GISEL-NEXT: v_subrev_u32_e32 v2, 64, v3 @@ -1021,11 +1084,14 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: v_mul_lo_u32 v5, v4, v10 ; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v8, v[1:2] ; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc -; GISEL-NEXT: .LBB2_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: .LBB2_6: ; %Flow1 +; GISEL-NEXT: s_or_b64 exec, exec, s[14:15] ; GISEL-NEXT: .LBB2_7: ; %Flow2 -; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[14:15] -; GISEL-NEXT: s_cbranch_execz .LBB2_9 +; GISEL-NEXT: s_xor_b64 s[6:7], s[14:15], exec +; GISEL-NEXT: s_cmp_lg_u64 s[14:15], 0 +; GISEL-NEXT: s_cmov_b64 exec, s[14:15] +; GISEL-NEXT: s_cbranch_scc0 .LBB2_9 ; GISEL-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] ; GISEL-NEXT: v_and_b32_e32 v1, 1, v1 @@ -1095,10 +1161,10 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1 ; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1 ; GISEL-NEXT: v_mov_b32_e32 v2, v1 -; GISEL-NEXT: .LBB2_9: ; %Flow3 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] -; GISEL-NEXT: .LBB2_10: ; %fp-to-i-cleanup +; GISEL-NEXT: .LBB2_9: ; %Flow3 ; 
GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB2_10: ; %fp-to-i-cleanup ; GISEL-NEXT: s_setpc_b64 s[30:31] %cvt = fptosi float %x to i128 ret i128 %cvt @@ -1111,14 +1177,16 @@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG-NEXT: v_mov_b32_e32 v4, v0 ; SDAG-NEXT: v_bfe_u32 v5, v4, 23, 8 ; SDAG-NEXT: s_movk_i32 s4, 0x7e +; SDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v5 ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v6, 0 +; SDAG-NEXT: s_mov_b64 s[8:9], exec ; SDAG-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-NEXT: v_mov_b32_e32 v3, 0 -; SDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v5 -; SDAG-NEXT: s_and_saveexec_b64 s[8:9], vcc -; SDAG-NEXT: s_cbranch_execz .LBB3_10 +; SDAG-NEXT: v_mov_b32_e32 v6, 0 +; SDAG-NEXT: s_cmp_lg_u64 vcc, 0 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB3_10 ; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-end ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5 ; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc @@ -1130,27 +1198,30 @@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1] ; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, -1, v4 ; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], exec +; SDAG-NEXT: s_xor_b64 s[10:11], s[4:5], exec +; SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[6:7] -; SDAG-NEXT: s_cbranch_execz .LBB3_7 +; SDAG-NEXT: s_cmov_b64 exec, s[4:5] +; SDAG-NEXT: s_cbranch_scc0 .LBB3_7 ; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9 ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0 ; SDAG-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, -1, s[4:5] ; SDAG-NEXT: s_mov_b64 s[4:5], 0x95 -; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v4 ; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6] +; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v4 
+; SDAG-NEXT: s_xor_b64 s[12:13], s[4:5], exec ; SDAG-NEXT: v_mov_b32_e32 v7, 0 ; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v10, -1, 1, vcc ; SDAG-NEXT: v_or_b32_e32 v6, 0x800000, v0 +; SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7] -; SDAG-NEXT: s_cbranch_execz .LBB3_4 +; SDAG-NEXT: s_cmov_b64 exec, s[4:5] +; SDAG-NEXT: s_cbranch_scc0 .LBB3_4 ; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else ; SDAG-NEXT: v_sub_u32_e32 v0, 0xd6, v5 ; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff2a, v5 @@ -1188,12 +1259,15 @@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 ; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 ; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] -; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 +; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] ; SDAG-NEXT: .LBB3_4: ; %Flow -; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[12:13] -; SDAG-NEXT: s_cbranch_execz .LBB3_6 +; SDAG-NEXT: s_xor_b64 s[6:7], s[12:13], exec +; SDAG-NEXT: s_cmp_lg_u64 s[12:13], 0 +; SDAG-NEXT: s_cmov_b64 exec, s[12:13] +; SDAG-NEXT: s_cbranch_scc0 .LBB3_6 ; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; SDAG-NEXT: v_sub_u32_e32 v2, 0x96, v5 ; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[6:7] @@ -1207,10 +1281,14 @@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG-NEXT: v_mov_b32_e32 v1, v5 ; SDAG-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v9, v3, v[1:2] ; SDAG-NEXT: v_mov_b32_e32 v1, v4 -; SDAG-NEXT: .LBB3_6: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: .LBB3_6: ; %Flow1 +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB3_7: ; %Flow2 -; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: s_xor_b64 s[4:5], s[10:11], exec +; SDAG-NEXT: 
s_cmp_lg_u64 s[10:11], 0 +; SDAG-NEXT: s_cmov_b64 exec, s[10:11] +; SDAG-NEXT: s_cbranch_scc0 .LBB3_9 ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 @@ -1218,10 +1296,10 @@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; SDAG-NEXT: v_mov_b32_e32 v0, v2 ; SDAG-NEXT: v_mov_b32_e32 v1, v2 -; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; SDAG-NEXT: .LBB3_10: ; %fp-to-i-cleanup +; SDAG-NEXT: .LBB3_9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: .LBB3_10: ; %fp-to-i-cleanup ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: fptoui_f32_to_i128: @@ -1230,19 +1308,21 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: v_mov_b32_e32 v4, v0 ; GISEL-NEXT: v_mov_b32_e32 v5, 0 ; GISEL-NEXT: v_lshrrev_b64 v[0:1], 23, v[4:5] -; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: v_mov_b32_e32 v7, v5 ; GISEL-NEXT: v_bfe_u32 v6, v0, 0, 8 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x7f ; GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GISEL-NEXT: v_mov_b32_e32 v7, v5 +; GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1] ; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] ; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: s_mov_b64 s[12:13], exec ; GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GISEL-NEXT: v_mov_b32_e32 v2, s6 ; GISEL-NEXT: v_mov_b32_e32 v3, s7 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc -; GISEL-NEXT: s_cbranch_execz .LBB3_10 +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB3_10 ; GISEL-NEXT: ; %bb.1: ; %fp-to-i-if-end ; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v6 ; GISEL-NEXT: v_mov_b32_e32 v2, 0xffffff80 @@ -1259,10 +1339,11 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GISEL-NEXT: s_xor_b64 s[14:15], vcc, exec +; GISEL-NEXT: 
s_cmp_lg_u64 vcc, 0 ; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_xor_b64 s[14:15], exec, s[6:7] -; GISEL-NEXT: s_cbranch_execz .LBB3_7 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB3_7 ; GISEL-NEXT: ; %bb.2: ; %fp-to-i-if-end9 ; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7] @@ -1321,14 +1402,15 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: v_or3_b32 v8, v0, v2, 0 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x96 ; GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GISEL-NEXT: v_and_b32_e32 v2, 0x7fffff, v4 ; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1] +; GISEL-NEXT: v_and_b32_e32 v2, 0x7fffff, v4 +; GISEL-NEXT: s_xor_b64 s[16:17], vcc, exec ; GISEL-NEXT: v_or_b32_e32 v4, 0x800000, v2 ; GISEL-NEXT: v_mov_b32_e32 v5, 0 +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 ; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] -; GISEL-NEXT: s_cbranch_execz .LBB3_4 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB3_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else ; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff6a, v6 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5] @@ -1362,9 +1444,12 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr9 +; GISEL-NEXT: s_or_b64 exec, exec, s[16:17] ; GISEL-NEXT: .LBB3_4: ; %Flow -; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[16:17] -; GISEL-NEXT: s_cbranch_execz .LBB3_6 +; GISEL-NEXT: s_xor_b64 s[6:7], s[16:17], exec +; GISEL-NEXT: s_cmp_lg_u64 s[16:17], 0 +; GISEL-NEXT: s_cmov_b64 exec, s[16:17] +; GISEL-NEXT: s_cbranch_scc0 .LBB3_6 ; GISEL-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; GISEL-NEXT: v_sub_co_u32_e32 v3, vcc, 0x96, v6 ; GISEL-NEXT: v_subrev_u32_e32 v2, 64, v3 @@ -1379,11 +1464,14 @@ define i128 
@fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: v_mul_lo_u32 v5, v4, v10 ; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v8, v[1:2] ; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc -; GISEL-NEXT: .LBB3_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: .LBB3_6: ; %Flow1 +; GISEL-NEXT: s_or_b64 exec, exec, s[14:15] ; GISEL-NEXT: .LBB3_7: ; %Flow2 -; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[14:15] -; GISEL-NEXT: s_cbranch_execz .LBB3_9 +; GISEL-NEXT: s_xor_b64 s[6:7], s[14:15], exec +; GISEL-NEXT: s_cmp_lg_u64 s[14:15], 0 +; GISEL-NEXT: s_cmov_b64 exec, s[14:15] +; GISEL-NEXT: s_cbranch_scc0 .LBB3_9 ; GISEL-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] ; GISEL-NEXT: v_and_b32_e32 v1, 1, v1 @@ -1453,10 +1541,10 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1 ; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1 ; GISEL-NEXT: v_mov_b32_e32 v2, v1 -; GISEL-NEXT: .LBB3_9: ; %Flow3 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] -; GISEL-NEXT: .LBB3_10: ; %fp-to-i-cleanup +; GISEL-NEXT: .LBB3_9: ; %Flow3 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB3_10: ; %fp-to-i-cleanup ; GISEL-NEXT: s_setpc_b64 s[30:31] %cvt = fptoui float %x to i128 ret i128 %cvt @@ -1497,14 +1585,16 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_mov_b32_e32 v4, v0 ; SDAG-NEXT: v_bfe_u32 v5, v4, 7, 8 ; SDAG-NEXT: s_movk_i32 s4, 0x7e +; SDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v5 ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v6, 0 +; SDAG-NEXT: s_mov_b64 s[8:9], exec ; SDAG-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-NEXT: v_mov_b32_e32 v3, 0 -; SDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v5 -; SDAG-NEXT: s_and_saveexec_b64 s[8:9], vcc -; SDAG-NEXT: s_cbranch_execz .LBB6_10 +; SDAG-NEXT: v_mov_b32_e32 v6, 0 +; SDAG-NEXT: s_cmp_lg_u64 vcc, 0 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB6_10 ; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-end ; 
SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5 ; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc @@ -1516,25 +1606,28 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1] ; SDAG-NEXT: v_cmp_lt_i16_e32 vcc, -1, v4 ; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], exec +; SDAG-NEXT: s_xor_b64 s[10:11], s[4:5], exec +; SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[6:7] -; SDAG-NEXT: s_cbranch_execz .LBB6_7 +; SDAG-NEXT: s_cmov_b64 exec, s[4:5] +; SDAG-NEXT: s_cbranch_scc0 .LBB6_7 ; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9 ; SDAG-NEXT: s_movk_i32 s4, 0x7f ; SDAG-NEXT: v_and_b32_sdwa v0, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; SDAG-NEXT: s_mov_b64 s[4:5], 0x85 ; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6] ; SDAG-NEXT: v_mov_b32_e32 v7, 0 +; SDAG-NEXT: s_xor_b64 s[12:13], s[4:5], exec ; SDAG-NEXT: v_cndmask_b32_e64 v9, -1, 0, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 1, vcc ; SDAG-NEXT: v_or_b32_e32 v6, 0x80, v0 +; SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7] -; SDAG-NEXT: s_cbranch_execz .LBB6_4 +; SDAG-NEXT: s_cmov_b64 exec, s[4:5] +; SDAG-NEXT: s_cbranch_scc0 .LBB6_4 ; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else ; SDAG-NEXT: v_sub_u32_e32 v0, 0xc6, v5 ; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff3a, v5 @@ -1573,11 +1666,15 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 ; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 ; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] -; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: ; implicit-def: 
$vgpr5_vgpr6 ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 +; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] ; SDAG-NEXT: .LBB6_4: ; %Flow -; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[12:13] +; SDAG-NEXT: s_xor_b64 s[6:7], s[12:13], exec +; SDAG-NEXT: s_cmp_lg_u64 s[12:13], 0 +; SDAG-NEXT: s_cmov_b64 exec, s[12:13] +; SDAG-NEXT: s_cbranch_scc0 .LBB6_6 ; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; SDAG-NEXT: v_sub_u32_e32 v2, 0x86, v5 ; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[6:7] @@ -1589,10 +1686,14 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; SDAG-NEXT: v_mul_i32_i24_e32 v0, v0, v8 ; SDAG-NEXT: v_mov_b32_e32 v3, v2 -; SDAG-NEXT: ; %bb.6: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: .LBB6_6: ; %Flow1 +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB6_7: ; %Flow2 -; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: s_xor_b64 s[4:5], s[10:11], exec +; SDAG-NEXT: s_cmp_lg_u64 s[10:11], 0 +; SDAG-NEXT: s_cmov_b64 exec, s[10:11] +; SDAG-NEXT: s_cbranch_scc0 .LBB6_9 ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 @@ -1600,10 +1701,10 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; SDAG-NEXT: v_mov_b32_e32 v0, v2 ; SDAG-NEXT: v_mov_b32_e32 v1, v2 -; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; SDAG-NEXT: .LBB6_10: ; %fp-to-i-cleanup +; SDAG-NEXT: .LBB6_9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: .LBB6_10: ; %fp-to-i-cleanup ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: fptosi_bf16_to_i128: @@ -1614,17 +1715,19 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: v_mov_b32_e32 v6, 0 ; GISEL-NEXT: v_lshrrev_b64 v[0:1], 7, v[5:6] ; GISEL-NEXT: v_mov_b32_e32 v1, 0x7f -; GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GISEL-NEXT: v_bfe_u32 v5, v0, 0, 8 +; GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GISEL-NEXT: 
v_cmp_ge_u64_e32 vcc, v[5:6], v[1:2] ; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] ; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: s_mov_b64 s[12:13], exec ; GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GISEL-NEXT: v_mov_b32_e32 v2, s6 ; GISEL-NEXT: v_mov_b32_e32 v3, s7 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc -; GISEL-NEXT: s_cbranch_execz .LBB6_10 +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB6_10 ; GISEL-NEXT: ; %bb.1: ; %fp-to-i-if-end ; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5 ; GISEL-NEXT: v_mov_b32_e32 v2, 0xffffff80 @@ -1641,10 +1744,11 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GISEL-NEXT: s_xor_b64 s[14:15], vcc, exec +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 ; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_xor_b64 s[14:15], exec, s[6:7] -; GISEL-NEXT: s_cbranch_execz .LBB6_7 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB6_7 ; GISEL-NEXT: ; %bb.2: ; %fp-to-i-if-end9 ; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7] @@ -1702,14 +1806,15 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: v_or3_b32 v10, v11, v0, 0 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x86 ; GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GISEL-NEXT: v_and_b32_e32 v2, 0x7f, v4 ; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[5:6], v[0:1] +; GISEL-NEXT: v_and_b32_e32 v2, 0x7f, v4 +; GISEL-NEXT: s_xor_b64 s[16:17], vcc, exec ; GISEL-NEXT: v_or_b32_e32 v7, 0x80, v2 ; GISEL-NEXT: v_mov_b32_e32 v8, 0 +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 ; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] -; GISEL-NEXT: s_cbranch_execz .LBB6_4 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB6_4 
; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else ; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff7a, v5 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[7:8] @@ -1742,9 +1847,12 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v9, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GISEL-NEXT: ; implicit-def: $vgpr9 +; GISEL-NEXT: s_or_b64 exec, exec, s[16:17] ; GISEL-NEXT: .LBB6_4: ; %Flow -; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[16:17] -; GISEL-NEXT: s_cbranch_execz .LBB6_6 +; GISEL-NEXT: s_xor_b64 s[6:7], s[16:17], exec +; GISEL-NEXT: s_cmp_lg_u64 s[16:17], 0 +; GISEL-NEXT: s_cmov_b64 exec, s[16:17] +; GISEL-NEXT: s_cbranch_scc0 .LBB6_6 ; GISEL-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; GISEL-NEXT: v_sub_co_u32_e32 v3, vcc, 0x86, v5 ; GISEL-NEXT: v_subrev_u32_e32 v2, 64, v3 @@ -1758,11 +1866,14 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GISEL-NEXT: v_mul_i32_i24_e32 v0, v0, v9 ; GISEL-NEXT: v_mov_b32_e32 v3, v2 -; GISEL-NEXT: .LBB6_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: .LBB6_6: ; %Flow1 +; GISEL-NEXT: s_or_b64 exec, exec, s[14:15] ; GISEL-NEXT: .LBB6_7: ; %Flow2 -; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[14:15] -; GISEL-NEXT: s_cbranch_execz .LBB6_9 +; GISEL-NEXT: s_xor_b64 s[6:7], s[14:15], exec +; GISEL-NEXT: s_cmp_lg_u64 s[14:15], 0 +; GISEL-NEXT: s_cmov_b64 exec, s[14:15] +; GISEL-NEXT: s_cbranch_scc0 .LBB6_9 ; GISEL-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] ; GISEL-NEXT: v_and_b32_e32 v1, 1, v1 @@ -1832,10 +1943,10 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1 ; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1 ; GISEL-NEXT: v_mov_b32_e32 v2, v1 -; GISEL-NEXT: .LBB6_9: ; %Flow3 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] -; GISEL-NEXT: .LBB6_10: ; %fp-to-i-cleanup +; GISEL-NEXT: .LBB6_9: ; %Flow3 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB6_10: ; 
%fp-to-i-cleanup ; GISEL-NEXT: s_setpc_b64 s[30:31] %cvt = fptosi bfloat %x to i128 ret i128 %cvt @@ -1848,14 +1959,16 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_mov_b32_e32 v4, v0 ; SDAG-NEXT: v_bfe_u32 v5, v4, 7, 8 ; SDAG-NEXT: s_movk_i32 s4, 0x7e +; SDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v5 ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v6, 0 +; SDAG-NEXT: s_mov_b64 s[8:9], exec ; SDAG-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-NEXT: v_mov_b32_e32 v3, 0 -; SDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v5 -; SDAG-NEXT: s_and_saveexec_b64 s[8:9], vcc -; SDAG-NEXT: s_cbranch_execz .LBB7_10 +; SDAG-NEXT: v_mov_b32_e32 v6, 0 +; SDAG-NEXT: s_cmp_lg_u64 vcc, 0 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB7_10 ; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-end ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5 ; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc @@ -1867,25 +1980,28 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1] ; SDAG-NEXT: v_cmp_lt_i16_e32 vcc, -1, v4 ; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], exec +; SDAG-NEXT: s_xor_b64 s[10:11], s[4:5], exec +; SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[6:7] -; SDAG-NEXT: s_cbranch_execz .LBB7_7 +; SDAG-NEXT: s_cmov_b64 exec, s[4:5] +; SDAG-NEXT: s_cbranch_scc0 .LBB7_7 ; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9 ; SDAG-NEXT: s_movk_i32 s4, 0x7f ; SDAG-NEXT: v_and_b32_sdwa v0, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; SDAG-NEXT: s_mov_b64 s[4:5], 0x85 ; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6] ; SDAG-NEXT: v_mov_b32_e32 v7, 0 +; SDAG-NEXT: s_xor_b64 s[12:13], s[4:5], exec ; SDAG-NEXT: v_cndmask_b32_e64 v9, -1, 0, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 1, 
vcc ; SDAG-NEXT: v_or_b32_e32 v6, 0x80, v0 +; SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7] -; SDAG-NEXT: s_cbranch_execz .LBB7_4 +; SDAG-NEXT: s_cmov_b64 exec, s[4:5] +; SDAG-NEXT: s_cbranch_scc0 .LBB7_4 ; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else ; SDAG-NEXT: v_sub_u32_e32 v0, 0xc6, v5 ; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff3a, v5 @@ -1924,11 +2040,15 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 ; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 ; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] -; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 +; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] ; SDAG-NEXT: .LBB7_4: ; %Flow -; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[12:13] +; SDAG-NEXT: s_xor_b64 s[6:7], s[12:13], exec +; SDAG-NEXT: s_cmp_lg_u64 s[12:13], 0 +; SDAG-NEXT: s_cmov_b64 exec, s[12:13] +; SDAG-NEXT: s_cbranch_scc0 .LBB7_6 ; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; SDAG-NEXT: v_sub_u32_e32 v2, 0x86, v5 ; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[6:7] @@ -1940,10 +2060,14 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; SDAG-NEXT: v_mul_i32_i24_e32 v0, v0, v8 ; SDAG-NEXT: v_mov_b32_e32 v3, v2 -; SDAG-NEXT: ; %bb.6: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: .LBB7_6: ; %Flow1 +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB7_7: ; %Flow2 -; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: s_xor_b64 s[4:5], s[10:11], exec +; SDAG-NEXT: s_cmp_lg_u64 s[10:11], 0 +; SDAG-NEXT: s_cmov_b64 exec, s[10:11] +; SDAG-NEXT: s_cbranch_scc0 .LBB7_9 ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 @@ 
-1951,10 +2075,10 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; SDAG-NEXT: v_mov_b32_e32 v0, v2 ; SDAG-NEXT: v_mov_b32_e32 v1, v2 -; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; SDAG-NEXT: .LBB7_10: ; %fp-to-i-cleanup +; SDAG-NEXT: .LBB7_9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: .LBB7_10: ; %fp-to-i-cleanup ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: fptoui_bf16_to_i128: @@ -1965,17 +2089,19 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: v_mov_b32_e32 v6, 0 ; GISEL-NEXT: v_lshrrev_b64 v[0:1], 7, v[5:6] ; GISEL-NEXT: v_mov_b32_e32 v1, 0x7f -; GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GISEL-NEXT: v_bfe_u32 v5, v0, 0, 8 +; GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[5:6], v[1:2] ; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] ; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: s_mov_b64 s[12:13], exec ; GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GISEL-NEXT: v_mov_b32_e32 v2, s6 ; GISEL-NEXT: v_mov_b32_e32 v3, s7 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc -; GISEL-NEXT: s_cbranch_execz .LBB7_10 +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB7_10 ; GISEL-NEXT: ; %bb.1: ; %fp-to-i-if-end ; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5 ; GISEL-NEXT: v_mov_b32_e32 v2, 0xffffff80 @@ -1992,10 +2118,11 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GISEL-NEXT: s_xor_b64 s[14:15], vcc, exec +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 ; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_xor_b64 s[14:15], exec, s[6:7] -; GISEL-NEXT: s_cbranch_execz .LBB7_7 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB7_7 ; GISEL-NEXT: ; %bb.2: ; 
%fp-to-i-if-end9 ; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7] @@ -2053,14 +2180,15 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: v_or3_b32 v10, v11, v0, 0 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x86 ; GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GISEL-NEXT: v_and_b32_e32 v2, 0x7f, v4 ; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[5:6], v[0:1] +; GISEL-NEXT: v_and_b32_e32 v2, 0x7f, v4 +; GISEL-NEXT: s_xor_b64 s[16:17], vcc, exec ; GISEL-NEXT: v_or_b32_e32 v7, 0x80, v2 ; GISEL-NEXT: v_mov_b32_e32 v8, 0 +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 ; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] -; GISEL-NEXT: s_cbranch_execz .LBB7_4 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB7_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else ; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff7a, v5 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[7:8] @@ -2093,9 +2221,12 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v9, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GISEL-NEXT: ; implicit-def: $vgpr9 +; GISEL-NEXT: s_or_b64 exec, exec, s[16:17] ; GISEL-NEXT: .LBB7_4: ; %Flow -; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[16:17] -; GISEL-NEXT: s_cbranch_execz .LBB7_6 +; GISEL-NEXT: s_xor_b64 s[6:7], s[16:17], exec +; GISEL-NEXT: s_cmp_lg_u64 s[16:17], 0 +; GISEL-NEXT: s_cmov_b64 exec, s[16:17] +; GISEL-NEXT: s_cbranch_scc0 .LBB7_6 ; GISEL-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; GISEL-NEXT: v_sub_co_u32_e32 v3, vcc, 0x86, v5 ; GISEL-NEXT: v_subrev_u32_e32 v2, 64, v3 @@ -2109,11 +2240,14 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GISEL-NEXT: v_mul_i32_i24_e32 v0, v0, v9 ; GISEL-NEXT: v_mov_b32_e32 v3, v2 -; GISEL-NEXT: .LBB7_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: .LBB7_6: ; %Flow1 +; GISEL-NEXT: s_or_b64 exec, exec, s[14:15] 
; GISEL-NEXT: .LBB7_7: ; %Flow2 -; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[14:15] -; GISEL-NEXT: s_cbranch_execz .LBB7_9 +; GISEL-NEXT: s_xor_b64 s[6:7], s[14:15], exec +; GISEL-NEXT: s_cmp_lg_u64 s[14:15], 0 +; GISEL-NEXT: s_cmov_b64 exec, s[14:15] +; GISEL-NEXT: s_cbranch_scc0 .LBB7_9 ; GISEL-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] ; GISEL-NEXT: v_and_b32_e32 v1, 1, v1 @@ -2183,10 +2317,10 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1 ; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1 ; GISEL-NEXT: v_mov_b32_e32 v2, v1 -; GISEL-NEXT: .LBB7_9: ; %Flow3 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] -; GISEL-NEXT: .LBB7_10: ; %fp-to-i-cleanup +; GISEL-NEXT: .LBB7_9: ; %Flow3 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB7_10: ; %fp-to-i-cleanup ; GISEL-NEXT: s_setpc_b64 s[30:31] %cvt = fptoui bfloat %x to i128 ret i128 %cvt diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll index 3b2f15c8340a63..be479cb039be6e 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args.ll @@ -104,35 +104,40 @@ define void @i1_arg_i1_use(i1 %arg) #0 { ; CIGFX89-NEXT: v_and_b32_e32 v0, 1, v0 ; CIGFX89-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; CIGFX89-NEXT: s_xor_b64 s[6:7], vcc, -1 -; CIGFX89-NEXT: s_and_saveexec_b64 s[4:5], s[6:7] -; CIGFX89-NEXT: s_cbranch_execz .LBB3_2 +; CIGFX89-NEXT: s_mov_b64 s[4:5], exec +; CIGFX89-NEXT: s_and_b64 s[6:7], s[6:7], exec +; CIGFX89-NEXT: s_cmov_b64 exec, s[6:7] +; CIGFX89-NEXT: s_cbranch_scc0 .LBB3_2 ; CIGFX89-NEXT: ; %bb.1: ; %bb1 ; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 ; CIGFX89-NEXT: s_mov_b32 s6, -1 ; CIGFX89-NEXT: v_mov_b32_e32 v0, 0 ; CIGFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; CIGFX89-NEXT: s_waitcnt vmcnt(0) -; CIGFX89-NEXT: .LBB3_2: ; %bb2 ; CIGFX89-NEXT: s_or_b64 exec, exec, s[4:5] +; CIGFX89-NEXT: .LBB3_2: ; %bb2 ; CIGFX89-NEXT: s_setpc_b64 s[30:31] ; 
; GFX11-LABEL: i1_arg_i1_use: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX11-NEXT: s_xor_b32 s1, vcc_lo, -1 -; GFX11-NEXT: s_and_saveexec_b32 s0, s1 -; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: s_and_b32 s1, s1, exec_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cmov_b32 exec_lo, s1 +; GFX11-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX11-NEXT: ; %bb.1: ; %bb1 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: .LBB3_2: ; %bb2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: .LBB3_2: ; %bb2 ; GFX11-NEXT: s_setpc_b64 s[30:31] bb: br i1 %arg, label %bb2, label %bb1 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll index ad3f920eadc91f..98b0a1abdcf424 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll @@ -85,14 +85,14 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX908-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY %1 ; GFX908-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_1]], [[COPY8]], [[COPY3]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX908-NEXT: SI_WAVE_RECONVERGE [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3.Flow: ; GFX908-NEXT: successors: %bb.4(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: SI_END_CF 
[[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX908-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.4 (%ir-block.33): - ; GFX908-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw @@ -146,14 +146,14 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY %1 ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_1]], [[COPY8]], [[COPY3]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX90A_GFX940-NEXT: SI_WAVE_RECONVERGE [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX90A_GFX940-NEXT: {{ $}} ; GFX90A_GFX940-NEXT: bb.3.Flow: ; GFX90A_GFX940-NEXT: successors: %bb.4(0x80000000) ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_GFX940-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX90A_GFX940-NEXT: {{ $}} ; GFX90A_GFX940-NEXT: bb.4 (%ir-block.33): - ; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX90A_GFX940-NEXT: S_ENDPGM 0 ; ; GFX11_GFX12-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw @@ -199,14 +199,14 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX11_GFX12-NEXT: {{ $}} ; GFX11_GFX12-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX11_GFX12-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_1]], %1, [[COPY3]], 0, 0, 
implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX11_GFX12-NEXT: SI_WAVE_RECONVERGE [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX11_GFX12-NEXT: {{ $}} ; GFX11_GFX12-NEXT: bb.3.Flow: ; GFX11_GFX12-NEXT: successors: %bb.4(0x80000000) ; GFX11_GFX12-NEXT: {{ $}} - ; GFX11_GFX12-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX11_GFX12-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX11_GFX12-NEXT: {{ $}} ; GFX11_GFX12-NEXT: bb.4 (%ir-block.26): - ; GFX11_GFX12-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX11_GFX12-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret void diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll index 3951e02d46a8f3..1c94587727566f 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll @@ -90,26 +90,26 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX90A-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX90A-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY %2 ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_1]], [[COPY9]], [[COPY3]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: SI_WAVE_RECONVERGE [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.4 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.3.Flow: ; GFX90A-NEXT: successors: %bb.5(0x80000000) ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: 
[[PHI:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, %8, %bb.4 - ; GFX90A-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.5 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4 (%ir-block.35): ; GFX90A-NEXT: successors: %bb.3(0x80000000) ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF1]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2 - ; GFX90A-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec ; GFX90A-NEXT: early-clobber %45:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec ; GFX90A-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %45, 0, 0, implicit $mode, implicit $exec ; GFX90A-NEXT: [[COPY10:%[0-9]+]]:sreg_64_xexec = COPY [[COPY8]] ; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] ; GFX90A-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_6]], 0, [[COPY11]], [[COPY10]], implicit $exec + ; GFX90A-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.5 (%ir-block.41): @@ -171,26 +171,26 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX940-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX940-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY %2 ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_1]], [[COPY9]], [[COPY3]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: SI_WAVE_RECONVERGE [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX940-NEXT: S_BRANCH %bb.4 ; 
GFX940-NEXT: {{ $}} ; GFX940-NEXT: bb.3.Flow: ; GFX940-NEXT: successors: %bb.5(0x80000000) ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, %8, %bb.4 - ; GFX940-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX940-NEXT: S_BRANCH %bb.5 ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: bb.4 (%ir-block.35): ; GFX940-NEXT: successors: %bb.3(0x80000000) ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF1]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2 - ; GFX940-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec ; GFX940-NEXT: early-clobber %44:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec ; GFX940-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %44, 0, 0, implicit $mode, implicit $exec ; GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_64_xexec = COPY [[COPY8]] ; GFX940-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] ; GFX940-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_6]], 0, [[COPY11]], [[COPY10]], implicit $exec + ; GFX940-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX940-NEXT: S_BRANCH %bb.3 ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: bb.5 (%ir-block.41): @@ -252,25 +252,25 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY %2 ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_1]], [[COPY6]], [[COPY3]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX11-NEXT: 
SI_WAVE_RECONVERGE [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX11-NEXT: S_BRANCH %bb.4 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: bb.3.Flow: ; GFX11-NEXT: successors: %bb.5(0x80000000) ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, %8, %bb.4 - ; GFX11-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX11-NEXT: S_BRANCH %bb.5 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: bb.4 (%ir-block.32): ; GFX11-NEXT: successors: %bb.3(0x80000000) ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF1]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2 - ; GFX11-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec ; GFX11-NEXT: early-clobber %44:vgpr_32 = STRICT_WWM [[V_WRITELANE_B32_]], implicit $exec ; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %44, 0, 0, implicit $mode, implicit $exec ; GFX11-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY5]] ; GFX11-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_5]], 0, [[V_READFIRSTLANE_B32_]], [[COPY7]], implicit $exec + ; GFX11-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX11-NEXT: S_BRANCH %bb.3 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: bb.5 (%ir-block.38): diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll index ba94a53dff03bd..5d9d24e69bfcbe 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll @@ -59,8 +59,6 @@ define amdgpu_ps double @global_atomic_fadd_f64_rtn_atomicrmw(ptr addrspace(1) % ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2.atomicrmw.end: ; GFX90A-NEXT: 
[[PHI2:%[0-9]+]]:vreg_64_align2 = PHI [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN]], %bb.1 - ; GFX90A-NEXT: [[PHI3:%[0-9]+]]:sreg_64 = PHI [[SI_IF_BREAK]], %bb.1 - ; GFX90A-NEXT: SI_END_CF [[PHI3]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX90A-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub0 ; GFX90A-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub1 ; GFX90A-NEXT: $sgpr0 = COPY [[COPY12]] diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd-wrong-subtarget.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd-wrong-subtarget.ll index 56b25bacd2defc..c4f952ef0ad91c 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd-wrong-subtarget.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd-wrong-subtarget.ll @@ -8,9 +8,11 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(ptr addrsp ; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b64 s[4:5], exec +; GCN-NEXT: s_cmp_lg_u64 vcc, 0 ; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB0_4 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB0_4 ; GCN-NEXT: ; %bb.1: ; GCN-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -31,12 +33,12 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(ptr addrsp ; GCN-NEXT: buffer_wbinvl1 ; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v5 ; GCN-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7] -; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GCN-NEXT: s_cbranch_execnz .LBB0_2 +; GCN-NEXT: s_andn2_b64 s[0:1], exec, s[6:7] +; GCN-NEXT: s_cselect_b64 exec, s[0:1], s[6:7] +; GCN-NEXT: s_cbranch_scc1 .LBB0_2 ; GCN-NEXT: ; %bb.3: ; %Flow -; GCN-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-NEXT: .LBB0_4: ; %Flow2 ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: .LBB0_4: ; GCN-NEXT: v_readfirstlane_b32 s0, v1 ; GCN-NEXT: v_cvt_f32_ubyte0_e32 
v0, v0 ; GCN-NEXT: v_mov_b32_e32 v1, s0 @@ -56,8 +58,9 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(ptr addr ; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB1_2 +; GCN-NEXT: s_cmp_lg_u64 vcc, 0 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB1_2 ; GCN-NEXT: ; %bb.1: ; GCN-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-NEXT: s_bcnt1_i32_b64 s0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index 064238c63717ec..b565310898715a 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -64,10 +64,10 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB0_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -86,10 +86,10 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: 
s_setpc_b64 s[30:31] ; @@ -108,10 +108,10 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB0_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -130,10 +130,10 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB0_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -158,10 +158,10 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB0_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -187,10 +187,10 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 
s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB0_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -249,10 +249,10 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB1_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -271,10 +271,10 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -293,10 +293,10 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB1_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: 
s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -317,10 +317,10 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB1_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -344,10 +344,10 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB1_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -373,10 +373,10 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB1_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: 
v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -436,10 +436,10 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB2_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -458,10 +458,10 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -480,10 +480,10 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB2_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -504,10 +504,10 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; 
GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB2_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -537,10 +537,10 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB2_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -571,10 +571,10 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB2_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 @@ -633,10 +633,10 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: 
v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB3_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory: @@ -653,11 +653,11 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory: @@ -674,11 +674,11 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB3_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory: @@ -695,11 +695,11 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: 
v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB3_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory: @@ -722,11 +722,11 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB3_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory: @@ -750,11 +750,11 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB3_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -812,10 +812,10 @@ define void 
@global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB4_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -832,11 +832,11 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -853,11 +853,11 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB4_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: 
global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -876,11 +876,11 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB4_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -903,11 +903,11 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB4_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -931,11 +931,11 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB4_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; 
GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 @@ -994,10 +994,10 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB5_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -1014,11 +1014,11 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -1035,11 +1035,11 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB5_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX908-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -1058,11 +1058,11 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB5_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -1089,11 +1089,11 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB5_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -1121,11 +1121,11 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB5_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; 
GFX6-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 @@ -1184,10 +1184,10 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB6_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1208,10 +1208,10 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1230,10 +1230,10 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB6_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: 
v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -1254,10 +1254,10 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB6_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1281,10 +1281,10 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB6_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1310,10 +1310,10 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB6_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -1373,10 +1373,10 @@ define void 
@global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB7_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1395,11 +1395,11 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1416,11 +1416,11 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB7_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: 
global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1439,11 +1439,11 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB7_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1466,11 +1466,11 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB7_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1494,11 +1494,11 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB7_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, 
s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 @@ -1548,11 +1548,11 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote(ptr addrspace(1) %pt ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -1573,10 +1573,10 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote(ptr addrspace(1) %pt ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB8_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1595,10 +1595,10 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote(ptr addrspace(1) %pt ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; 
GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1617,10 +1617,10 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote(ptr addrspace(1) %pt ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB8_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -1641,10 +1641,10 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote(ptr addrspace(1) %pt ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB8_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote: @@ -1668,10 +1668,10 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote(ptr addrspace(1) %pt ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1697,10 +1697,10 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote(ptr addrspace(1) %pt ; GFX6-NEXT: 
buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB8_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -1751,11 +1751,11 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denor ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -1776,10 +1776,10 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denor ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB9_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1798,10 +1798,10 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denor ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 
v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1820,10 +1820,10 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denor ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB9_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -1844,10 +1844,10 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denor ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB9_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode: @@ -1871,10 +1871,10 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denor ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB9_1 +; 
GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1900,10 +1900,10 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denor ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB9_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -1953,11 +1953,11 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_deno ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB10_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode: @@ -1977,10 +1977,10 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_deno ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: 
s_cbranch_execnz .LBB10_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode: @@ -1997,11 +1997,11 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_deno ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode: @@ -2018,11 +2018,11 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_deno ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB10_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode: @@ -2041,11 +2041,11 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_deno ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; 
GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB10_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode: @@ -2068,11 +2068,11 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_deno ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB10_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode: @@ -2096,11 +2096,11 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_deno ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB10_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 @@ -2159,10 +2159,10 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory(p ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, 
v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB11_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2181,10 +2181,10 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory(p ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2203,10 +2203,10 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB11_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -2227,10 +2227,10 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB11_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: 
s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory: @@ -2254,10 +2254,10 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB11_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -2283,10 +2283,10 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB11_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -2346,10 +2346,10 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB12_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, 
s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2376,10 +2376,10 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB12_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -2400,10 +2400,10 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB12_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -2427,10 +2427,10 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__ ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB12_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -2456,10 +2456,10 @@ define float 
@global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__ ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB12_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -2510,11 +2510,11 @@ define float @global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode(ptr a ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB13_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2535,10 +2535,10 @@ define float @global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode(ptr a ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB13_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2557,10 +2557,10 @@ define float 
@global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode(ptr a ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2579,10 +2579,10 @@ define float @global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode(ptr a ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB13_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -2603,10 +2603,10 @@ define float @global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode(ptr a ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB13_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode: @@ -2630,10 +2630,10 @@ define float @global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode(ptr a ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; 
GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB13_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -2659,10 +2659,10 @@ define float @global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode(ptr a ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB13_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -2712,11 +2712,11 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote(ptr addrspace(1) %p ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote: @@ -2736,10 +2736,10 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote(ptr addrspace(1) %p ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; 
GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB14_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote: @@ -2756,11 +2756,11 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote(ptr addrspace(1) %p ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote: @@ -2777,11 +2777,11 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote(ptr addrspace(1) %p ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB14_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote: @@ -2800,11 +2800,11 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote(ptr addrspace(1) %p ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, 
s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB14_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote: @@ -2827,11 +2827,11 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote(ptr addrspace(1) %p ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB14_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote: @@ -2855,11 +2855,11 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote(ptr addrspace(1) %p ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB14_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 @@ -2918,10 +2918,10 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory( ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB15_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: @@ -2938,11 +2938,11 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory( ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: @@ -2959,11 +2959,11 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory( ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB15_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: @@ -2982,11 +2982,11 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory( ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: 
s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB15_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: @@ -3009,11 +3009,11 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory( ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB15_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: @@ -3037,11 +3037,11 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory( ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB15_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 @@ -3100,10 +3100,10 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: 
v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB16_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -3138,11 +3138,11 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB16_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -3165,11 +3165,11 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory_ ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB16_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -3193,11 +3193,11 @@ define void 
@global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory_ ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB16_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 @@ -3246,11 +3246,11 @@ define void @global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: @@ -3270,10 +3270,10 @@ define void @global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB17_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX90A-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: @@ -3290,11 +3290,11 @@ define void @global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: @@ -3311,11 +3311,11 @@ define void @global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB17_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: @@ -3334,11 +3334,11 @@ define void @global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB17_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, 
s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: @@ -3361,11 +3361,11 @@ define void @global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: @@ -3389,11 +3389,11 @@ define void @global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB17_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 @@ -3443,11 +3443,11 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 
exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3468,10 +3468,10 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB18_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -3490,10 +3490,10 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3512,10 +3512,10 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; 
GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -3534,10 +3534,10 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB18_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -3562,10 +3562,10 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB18_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -3591,10 +3591,10 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB18_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -3643,11 +3643,11 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr add ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; 
GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: @@ -3667,10 +3667,10 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr add ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB19_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: @@ -3687,11 +3687,11 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr add ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB19_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: @@ -3708,11 +3708,11 @@ define void 
@global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr add ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB19_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: @@ -3729,11 +3729,11 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr add ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB19_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: @@ -3756,11 +3756,11 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr add ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB19_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: @@ -3784,11 +3784,11 
@@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr add ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB19_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 @@ -3837,11 +3837,11 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB20_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3862,10 +3862,10 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB20_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 
v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -3884,10 +3884,10 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3906,10 +3906,10 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB20_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -3928,10 +3928,10 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB20_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -3956,10 +3956,10 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: 
v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB20_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -3985,10 +3985,10 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB20_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -4037,11 +4037,11 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB21_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: @@ -4061,10 +4061,10 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX10-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB21_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: @@ -4081,11 +4081,11 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: @@ -4102,11 +4102,11 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB21_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: @@ -4123,11 
+4123,11 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB21_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: @@ -4150,11 +4150,11 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB21_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: @@ -4178,11 +4178,11 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB21_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] 
%unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 @@ -4240,10 +4240,10 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB22_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4262,10 +4262,10 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4284,10 +4284,10 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB22_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -4306,10 +4306,10 @@ define float 
@global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB22_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -4334,10 +4334,10 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB22_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -4363,10 +4363,10 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB22_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -4425,10 +4425,10 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; 
GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB23_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: @@ -4445,11 +4445,11 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: @@ -4466,11 +4466,11 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB23_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: @@ -4487,11 +4487,11 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX8-NEXT: 
buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB23_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: @@ -4514,11 +4514,11 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB23_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: @@ -4542,11 +4542,11 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB23_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, 
!amdgpu.no.remote.memory !0, !amdgpu.no.fine.grained.memory !0 @@ -4608,10 +4608,10 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB24_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4638,10 +4638,10 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB24_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -4660,10 +4660,10 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB24_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -4688,10 +4688,10 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: 
v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB24_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -4717,10 +4717,10 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB24_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -4779,10 +4779,10 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB25_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4809,10 +4809,10 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB25_1 +; GFX908-NEXT: s_andn2_b64 
s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -4833,10 +4833,10 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB25_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -4860,10 +4860,10 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB25_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -4889,10 +4889,10 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB25_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -4952,10 +4952,10 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB26_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4982,10 +4982,10 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB26_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -5006,10 +5006,10 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB26_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ 
-5039,10 +5039,10 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB26_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -5073,10 +5073,10 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB26_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 @@ -5135,10 +5135,10 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB27_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -5171,11 +5171,11 @@ define void 
@global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB27_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -5198,11 +5198,11 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB27_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -5226,11 +5226,11 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB27_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val 
syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -5288,10 +5288,10 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB28_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -5326,11 +5326,11 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB28_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -5353,11 +5353,11 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB28_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 
s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -5381,11 +5381,11 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB28_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 @@ -5444,10 +5444,10 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB29_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -5482,11 +5482,11 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB29_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, 
exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -5513,11 +5513,11 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB29_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -5545,11 +5545,11 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB29_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 @@ -5608,10 +5608,10 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB30_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -5640,10 +5640,10 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB30_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -5664,10 +5664,10 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB30_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -5691,10 +5691,10 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB30_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -5720,10 +5720,10 @@ 
define float @global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB30_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -5783,10 +5783,10 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB31_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -5823,11 +5823,11 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB31_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -5850,11 +5850,11 @@ define void 
@global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB31_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -5878,11 +5878,11 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB31_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 @@ -5941,10 +5941,10 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_f ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB32_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -5971,10 +5971,10 @@ define float 
@global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_f ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB32_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -5995,10 +5995,10 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_f ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB32_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -6022,10 +6022,10 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_f ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB32_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -6051,10 +6051,10 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_f ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: 
v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB32_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -6114,10 +6114,10 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB33_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -6152,11 +6152,11 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB33_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -6179,11 +6179,11 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_ ; 
GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB33_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -6207,11 +6207,11 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_ ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB33_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 @@ -6261,11 +6261,11 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory(ptr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB34_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; 
GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -6286,10 +6286,10 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory(ptr ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB34_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -6308,10 +6308,10 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory(ptr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6330,10 +6330,10 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory(ptr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB34_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -6352,10 +6352,10 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory(ptr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: 
v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB34_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -6380,10 +6380,10 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory(ptr ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB34_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -6409,10 +6409,10 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory(ptr ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB34_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -6461,11 +6461,11 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory(pt ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: 
s_cbranch_execnz .LBB35_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory: @@ -6485,10 +6485,10 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory(pt ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB35_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory: @@ -6505,11 +6505,11 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory(pt ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory: @@ -6526,11 +6526,11 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory(pt ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], 
vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB35_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory: @@ -6547,11 +6547,11 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory(pt ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB35_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory: @@ -6574,11 +6574,11 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory(pt ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB35_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory: @@ -6602,11 +6602,11 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory(pt ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; 
GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB35_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 @@ -6664,10 +6664,10 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB36_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -6694,10 +6694,10 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB36_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -6716,10 +6716,10 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: 
s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB36_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -6744,10 +6744,10 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB36_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -6773,10 +6773,10 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB36_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -6835,10 +6835,10 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB37_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 
.LBB37_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -6871,11 +6871,11 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB37_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -6898,11 +6898,11 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB37_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -6926,11 +6926,11 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; 
GFX6-NEXT: s_cbranch_execnz .LBB37_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 @@ -6964,11 +6964,11 @@ define double @global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB38_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -6999,11 +6999,11 @@ define double @global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB38_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -7025,10 +7025,10 @@ 
define double @global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB38_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -7057,10 +7057,10 @@ define double @global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB38_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -7081,10 +7081,10 @@ define double @global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB38_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -7117,10 +7117,10 @@ define double @global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(p ; 
GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB38_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: @@ -7152,10 +7152,10 @@ define double @global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB38_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -7185,11 +7185,11 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB39_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 
v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -7220,11 +7220,11 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB39_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -7246,10 +7246,10 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB39_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -7278,10 +7278,10 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB39_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; 
GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -7304,10 +7304,10 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB39_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7338,10 +7338,10 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB39_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7373,10 +7373,10 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB39_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: 
s_setpc_b64 s[30:31] %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 @@ -7407,11 +7407,11 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB40_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -7442,11 +7442,11 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB40_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -7468,10 +7468,10 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB40_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; 
GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -7500,10 +7500,10 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB40_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -7526,10 +7526,10 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB40_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -7564,10 +7564,10 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB40_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; 
GFX7-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -7603,10 +7603,10 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB40_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 @@ -7636,11 +7636,11 @@ define void @global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB41_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: @@ -7669,11 +7669,11 @@ define void @global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; 
GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB41_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: @@ -7694,10 +7694,10 @@ define void @global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB41_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: @@ -7723,11 +7723,11 @@ define void @global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB41_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: @@ -7745,11 +7745,11 @@ define void 
@global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB41_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: @@ -7775,11 +7775,11 @@ define void @global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] ; GFX7-NEXT: v_mov_b32_e32 v6, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v7, v9 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB41_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: @@ -7806,11 +7806,11 @@ define void @global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] ; GFX6-NEXT: v_mov_b32_e32 v6, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v7, v9 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB41_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fadd ptr 
addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -7839,11 +7839,11 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB42_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7872,11 +7872,11 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB42_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7897,10 +7897,10 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: 
s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB42_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7926,11 +7926,11 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB42_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7950,11 +7950,11 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB42_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7980,11 +7980,11 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] ; 
GFX7-NEXT: v_mov_b32_e32 v6, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v7, v9 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB42_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8011,11 +8011,11 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] ; GFX6-NEXT: v_mov_b32_e32 v6, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v7, v9 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB42_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 @@ -8045,11 +8045,11 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB43_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -8078,11 +8078,11 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB43_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -8103,10 +8103,10 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB43_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -8132,11 +8132,11 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB43_1 +; GFX908-NEXT: 
s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -8156,11 +8156,11 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB43_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -8190,11 +8190,11 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] ; GFX7-NEXT: v_mov_b32_e32 v6, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v7, v9 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB43_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -8225,11 +8225,11 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] ; GFX6-NEXT: v_mov_b32_e32 v6, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 
v7, v9 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB43_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 @@ -8278,11 +8278,11 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB44_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -8291,32 +8291,32 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v0 ; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX940-NEXT: global_load_dword v4, v[0:1], off +; GFX940-NEXT: global_load_dword v5, v[0:1], off ; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: v_lshrrev_b32_e32 
v4, v3, v7 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB44_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: @@ -8351,11 +8351,11 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB44_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -8385,10 
+8385,10 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB44_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -8397,31 +8397,31 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v4, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; 
GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: @@ -8429,31 +8429,31 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v0 ; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: global_load_dword v5, v[0:1], off ; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v4 -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX908-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX908-NEXT: v_mov_b32_e32 v6, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc ; 
GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB44_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: @@ -8482,10 +8482,10 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB44_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -8523,11 +8523,11 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB44_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -8566,11 
+8566,11 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB44_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -8617,11 +8617,11 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB45_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -8629,35 +8629,35 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: global_load_dword v4, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; 
GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 ; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB45_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8693,11 +8693,11 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; 
GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB45_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -8728,10 +8728,10 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB45_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -8741,31 +8741,31 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v4, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; 
GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB45_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8774,31 +8774,31 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: global_load_dword v5, v[0:1], off ; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v4 -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX908-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX908-NEXT: v_mov_b32_e32 v6, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB45_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8828,10 +8828,10 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB45_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -8870,11 +8870,11 @@ define half 
@global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB45_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -8915,11 +8915,11 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB45_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -8967,11 +8967,11 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB46_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: 
v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -8980,35 +8980,35 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_movk_i32 s0, 0xf800 ; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: global_load_dword v4, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 ; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: 
s_cbranch_execnz .LBB46_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -9044,11 +9044,11 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB46_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -9079,10 +9079,10 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB46_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -9092,31 +9092,31 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX90A-NEXT: 
v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v4, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB46_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -9125,31 +9125,31 @@ define half 
@global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: global_load_dword v5, v[0:1], off ; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v4 -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX908-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX908-NEXT: v_mov_b32_e32 v6, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB46_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: 
global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -9179,10 +9179,10 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB46_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -9221,11 +9221,11 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB46_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -9266,11 +9266,11 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB46_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: 
s_waitcnt expcnt(0) @@ -9316,11 +9316,11 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB47_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: @@ -9348,11 +9348,11 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB47_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: @@ -9386,11 +9386,11 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB47_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, 
exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: @@ -9419,10 +9419,10 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB47_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: @@ -9449,11 +9449,11 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: @@ -9480,11 +9480,11 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; 
GFX908-NEXT: s_cbranch_execnz .LBB47_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: @@ -9512,11 +9512,11 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB47_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: @@ -9553,11 +9553,11 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v7 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB47_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: @@ -9595,11 +9595,11 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v7 -; GFX6-NEXT: s_andn2_b64 
exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB47_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fadd ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -9643,11 +9643,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB48_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9677,11 +9677,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB48_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9716,11 +9716,11 @@ define void 
@global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB48_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9750,10 +9750,10 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB48_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9781,11 +9781,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, 
s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9813,11 +9813,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB48_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9846,11 +9846,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB48_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9888,11 +9888,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB48_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 
.LBB48_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9931,11 +9931,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB48_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 @@ -9980,11 +9980,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB49_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10015,11 +10015,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: 
v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB49_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10054,11 +10054,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB49_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10088,10 +10088,10 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB49_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10119,11 +10119,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; 
GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10151,11 +10151,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB49_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10184,11 +10184,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB49_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ 
-10226,11 +10226,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB49_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10269,11 +10269,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB49_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024 @@ -10307,11 +10307,11 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB50_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -10333,10 +10333,10 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB50_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -10361,11 +10361,11 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB50_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -10388,10 +10388,10 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB50_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, 
exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -10412,10 +10412,10 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10436,10 +10436,10 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB50_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -10462,10 +10462,10 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB50_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -10494,11 +10494,11 @@ define 
half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB50_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -10529,11 +10529,11 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB50_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -10567,11 +10567,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB51_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -10591,11 +10591,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB51_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -10618,11 +10618,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB51_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -10644,10 +10644,10 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB51_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: 
s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -10666,11 +10666,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -10689,11 +10689,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB51_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -10714,11 +10714,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] 
; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB51_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -10747,11 +10747,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB51_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -10781,11 +10781,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB51_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 @@ -10831,11 +10831,11 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, 
v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB52_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -10843,35 +10843,35 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: global_load_dword v4, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 ; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX940-NEXT: 
v_lshlrev_b32_e32 v5, v3, v5 +; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB52_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10907,11 +10907,11 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB52_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -10942,10 +10942,10 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, 
s4 -; GFX10-NEXT: s_cbranch_execnz .LBB52_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -10955,33 +10955,33 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v4, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; 
GFX90A-NEXT: s_cbranch_execnz .LBB52_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10990,31 +10990,31 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: global_load_dword v5, v[0:1], off ; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v4 -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX908-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX908-NEXT: v_mov_b32_e32 v6, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; 
GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB52_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11044,10 +11044,10 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB52_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -11086,11 +11086,11 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB52_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -11131,11 +11131,11 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: 
v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB52_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -11182,11 +11182,11 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB53_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11216,11 +11216,11 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB53_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11255,11 +11255,11 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB53_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11289,10 +11289,10 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB53_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11322,11 +11322,11 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], 
s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11354,11 +11354,11 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB53_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11387,11 +11387,11 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB53_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11429,11 +11429,11 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 -; GFX7-NEXT: s_andn2_b64 exec, 
exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB53_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11472,11 +11472,11 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB53_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 @@ -11533,11 +11533,11 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB54_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -11576,10 +11576,10 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: 
s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB54_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -11624,11 +11624,11 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB54_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -11664,10 +11664,10 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB54_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -11703,10 +11703,10 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; 
GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11742,10 +11742,10 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB54_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -11782,10 +11782,10 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB54_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -11823,11 +11823,11 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB54_1 
+; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -11866,11 +11866,11 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB54_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -11926,11 +11926,11 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB55_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -11971,10 +11971,10 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; 
GFX940-NEXT: s_cbranch_execnz .LBB55_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -12021,11 +12021,11 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB55_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -12062,10 +12062,10 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB55_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -12102,10 +12102,10 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz 
.LBB55_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -12142,10 +12142,10 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB55_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -12183,10 +12183,10 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB55_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -12225,11 +12225,11 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB55_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: 
s_cbranch_scc1 .LBB55_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -12270,11 +12270,11 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB55_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -12331,11 +12331,11 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB56_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -12377,10 +12377,10 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB56_1 +; GFX940-NEXT: s_andn2_b64 
s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -12427,11 +12427,11 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB56_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -12468,10 +12468,10 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB56_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -12508,10 +12508,10 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; 
GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -12548,10 +12548,10 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB56_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -12589,10 +12589,10 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB56_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -12631,11 +12631,11 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB56_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -12676,11 +12676,11 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB56_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -12734,11 +12734,11 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB57_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: @@ -12775,11 +12775,11 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: 
s_cbranch_execnz .LBB57_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: @@ -12822,11 +12822,11 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB57_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: @@ -12861,10 +12861,10 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB57_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: @@ -12898,11 +12898,11 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; 
GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: @@ -12936,11 +12936,11 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB57_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: @@ -12975,11 +12975,11 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB57_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: @@ -13016,11 +13016,11 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 
vcc, v7, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v7 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB57_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: @@ -13058,11 +13058,11 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v7 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB57_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fadd ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -13115,11 +13115,11 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB58_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13158,11 +13158,11 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB58_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13207,11 +13207,11 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB58_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13247,10 +13247,10 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB58_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, 
s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13285,11 +13285,11 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB58_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13324,11 +13324,11 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB58_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13364,11 +13364,11 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; 
GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB58_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13406,11 +13406,11 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB58_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13449,11 +13449,11 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB58_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 @@ -13507,11 +13507,11 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB59_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13551,11 +13551,11 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB59_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13600,11 +13600,11 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB59_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX10-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13640,10 +13640,10 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB59_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13678,11 +13678,11 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB59_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13717,11 +13717,11 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB59_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX908-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13757,11 +13757,11 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB59_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13799,11 +13799,11 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB59_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13842,11 +13842,11 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB59_1 +; GFX6-NEXT: s_cselect_b64 exec, 
s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024 @@ -13889,11 +13889,11 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB60_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -13925,10 +13925,10 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB60_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -13963,11 +13963,11 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB60_1 +; 
GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -13997,10 +13997,10 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB60_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -14030,10 +14030,10 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB60_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -14063,10 +14063,10 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB60_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; 
GFX908-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -14097,10 +14097,10 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB60_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -14129,11 +14129,11 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB60_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -14164,11 +14164,11 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB60_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX6-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -14211,11 +14211,11 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB61_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -14245,11 +14245,11 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB61_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -14282,11 +14282,11 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB61_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -14315,10 +14315,10 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB61_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -14346,11 +14346,11 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB61_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -14378,11 +14378,11 @@ define void 
@global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB61_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -14411,11 +14411,11 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB61_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -14444,11 +14444,11 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB61_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: 
global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -14478,11 +14478,11 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB61_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 @@ -14537,11 +14537,11 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB62_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB62_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -14582,10 +14582,10 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB62_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB62_1 ; 
GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -14632,11 +14632,11 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB62_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB62_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -14673,10 +14673,10 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB62_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB62_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -14715,10 +14715,10 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB62_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB62_1 ; GFX90A-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -14755,10 +14755,10 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB62_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB62_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -14796,10 +14796,10 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB62_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB62_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -14838,11 +14838,11 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB62_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB62_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 
; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -14883,11 +14883,11 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB62_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB62_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -14943,11 +14943,11 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB63_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB63_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14986,11 +14986,11 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB63_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: 
s_cbranch_scc1 .LBB63_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15035,11 +15035,11 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB63_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB63_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15075,10 +15075,10 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB63_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB63_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15115,11 +15115,11 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; 
GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB63_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB63_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15154,11 +15154,11 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB63_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB63_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15194,11 +15194,11 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB63_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB63_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15236,11 +15236,11 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 
vcc, v7, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB63_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB63_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15279,11 +15279,11 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB63_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB63_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 @@ -15337,11 +15337,11 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB64_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB64_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -15362,10 +15362,10 @@ define <2 x half> 
@global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB64_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB64_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -15392,10 +15392,10 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB64_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB64_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -15416,10 +15416,10 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB64_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB64_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -15447,27 +15447,27 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: 
v_or_b32_e32 v3, v2, v3 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX7-NEXT: v_mov_b32_e32 v7, v3 +; GFX7-NEXT: v_mov_b32_e32 v6, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB64_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB64_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -15493,31 +15493,31 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v2, v3 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 ; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; 
GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX6-NEXT: v_mov_b32_e32 v7, v3 +; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB64_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB64_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -15568,11 +15568,11 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB65_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB65_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -15593,10 +15593,10 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB65_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB65_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -15623,10 +15623,10 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB65_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB65_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -15649,10 +15649,10 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB65_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB65_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15679,27 +15679,27 @@ define <2 x half> 
@global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v2, v3 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX7-NEXT: v_mov_b32_e32 v7, v3 +; GFX7-NEXT: v_mov_b32_e32 v6, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB65_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB65_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -15725,31 +15725,31 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX6-NEXT: 
v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v2, v3 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 ; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX6-NEXT: v_mov_b32_e32 v7, v3 +; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB65_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB65_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -15801,11 +15801,11 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB66_1 +; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB66_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -15826,10 +15826,10 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB66_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB66_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -15856,10 +15856,10 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB66_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB66_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -15882,10 +15882,10 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB66_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB66_1 ; 
GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -15916,27 +15916,27 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_or_b32_e32 v0, v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v8 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB66_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB66_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -15964,31 +15964,31 @@ define <2 x half> 
@global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_or_b32_e32 v0, v7, v0 +; GFX6-NEXT: v_mov_b32_e32 v7, v1 +; GFX6-NEXT: v_mov_b32_e32 v6, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v8 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB66_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB66_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 @@ -16037,11 +16037,11 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX11-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB67_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB67_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -16061,10 +16061,10 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB67_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB67_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -16099,11 +16099,11 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB67_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB67_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: @@ 
-16130,27 +16130,27 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB67_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB67_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -16174,31 +16174,31 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: 
s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB67_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB67_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -16246,11 +16246,11 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB68_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB68_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16270,10 +16270,10 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB68_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB68_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16310,11 +16310,11 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB68_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB68_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16341,27 +16341,27 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: 
v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB68_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB68_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16385,31 +16385,31 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: 
v_cvt_f32_f16_e32 v7, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB68_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB68_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 @@ -16458,11 +16458,11 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB69_1 +; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB69_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -16482,10 +16482,10 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB69_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB69_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -16522,11 +16522,11 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB69_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB69_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -16557,27 +16557,27 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; 
GFX7-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB69_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB69_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -16605,31 +16605,31 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX6-NEXT: v_add_f32_e32 v6, 
v6, v2 ; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB69_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB69_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 @@ -16679,11 +16679,11 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB70_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: 
s_cbranch_scc1 .LBB70_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -16704,10 +16704,10 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB70_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB70_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -16736,10 +16736,10 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB70_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB70_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -16762,10 +16762,10 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB70_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB70_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: 
global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16792,27 +16792,27 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v2, v3 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX7-NEXT: v_mov_b32_e32 v7, v3 +; GFX7-NEXT: v_mov_b32_e32 v6, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB70_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB70_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -16838,31 +16838,31 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; 
GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v2, v3 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 ; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX6-NEXT: v_mov_b32_e32 v7, v3 +; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB70_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB70_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -16913,11 +16913,11 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; 
GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB71_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB71_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16937,10 +16937,10 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB71_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB71_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16979,11 +16979,11 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB71_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB71_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -17010,27 +17010,27 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fin ; 
GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB71_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB71_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -17054,31 +17054,31 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; 
GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB71_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB71_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 @@ -17128,11 +17128,11 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(p ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB72_1 +; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB72_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -17153,10 +17153,10 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(p ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB72_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB72_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -17175,10 +17175,10 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(p ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB72_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB72_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -17197,10 +17197,10 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(p ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB72_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: 
s_cbranch_scc1 .LBB72_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -17221,10 +17221,10 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(p ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB72_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB72_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -17252,27 +17252,27 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(p ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v2, v3 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX7-NEXT: v_mov_b32_e32 v7, v3 +; GFX7-NEXT: v_mov_b32_e32 v6, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: 
v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB72_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB72_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -17298,31 +17298,31 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(p ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v2, v3 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 ; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX6-NEXT: v_mov_b32_e32 v7, v3 +; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, 
s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB72_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB72_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -17372,11 +17372,11 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr a ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB73_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB73_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: @@ -17396,10 +17396,10 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr a ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB73_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB73_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: @@ -17416,11 +17416,11 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr a ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: 
s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB73_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB73_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: @@ -17437,11 +17437,11 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr a ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB73_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB73_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: @@ -17460,11 +17460,11 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr a ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB73_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB73_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: @@ -17491,27 +17491,27 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr a ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; 
GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB73_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB73_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: @@ -17535,31 +17535,31 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr a ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; 
GFX6-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB73_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB73_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 @@ -17608,11 +17608,11 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB74_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB74_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -17633,10 +17633,10 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB74_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB74_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -17663,10 +17663,10 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB74_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB74_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -17687,10 +17687,10 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB74_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB74_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -17718,27 +17718,27 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v2, v3 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX7-NEXT: v_mov_b32_e32 v7, v3 +; GFX7-NEXT: v_mov_b32_e32 v6, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB74_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB74_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -17764,31 +17764,31 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; 
GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v2, v3 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 ; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX6-NEXT: v_mov_b32_e32 v7, v3 +; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB74_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB74_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -17838,11 +17838,11 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 
exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB75_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB75_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -17862,10 +17862,10 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB75_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB75_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -17900,11 +17900,11 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB75_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB75_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -17931,27 +17931,27 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; 
GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB75_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB75_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -17975,31 +17975,31 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: 
v_cvt_f32_f16_e32 v7, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB75_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB75_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 @@ -18048,11 +18048,11 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspac ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: 
s_cbranch_execnz .LBB76_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB76_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -18073,10 +18073,10 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspac ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB76_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB76_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -18095,10 +18095,10 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspac ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB76_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB76_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -18117,10 +18117,10 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspac ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB76_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: 
s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB76_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -18141,10 +18141,10 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspac ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB76_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB76_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -18172,27 +18172,27 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspac ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v2, v3 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX7-NEXT: v_mov_b32_e32 v7, v3 +; GFX7-NEXT: v_mov_b32_e32 v6, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX7-NEXT: 
v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB76_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB76_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -18218,31 +18218,31 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspac ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v2, v3 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 ; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX6-NEXT: v_mov_b32_e32 v7, v3 +; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; 
GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB76_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB76_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -18292,11 +18292,11 @@ define void @global_agent_atomic_fadd_noret_v2f16__maybe_remote(ptr addrspace(1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB77_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB77_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote: @@ -18316,10 +18316,10 @@ define void @global_agent_atomic_fadd_noret_v2f16__maybe_remote(ptr addrspace(1) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB77_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB77_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote: @@ -18336,11 +18336,11 @@ define void @global_agent_atomic_fadd_noret_v2f16__maybe_remote(ptr addrspace(1) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; 
GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB77_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB77_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote: @@ -18357,11 +18357,11 @@ define void @global_agent_atomic_fadd_noret_v2f16__maybe_remote(ptr addrspace(1) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB77_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB77_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote: @@ -18380,11 +18380,11 @@ define void @global_agent_atomic_fadd_noret_v2f16__maybe_remote(ptr addrspace(1) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB77_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB77_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote: @@ -18411,27 +18411,27 @@ define void @global_agent_atomic_fadd_noret_v2f16__maybe_remote(ptr addrspace(1) ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: 
v_cvt_f32_f16_e32 v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB77_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB77_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote: @@ -18455,31 +18455,31 @@ define void @global_agent_atomic_fadd_noret_v2f16__maybe_remote(ptr addrspace(1) ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, 
v4, v5 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB77_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB77_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst @@ -18554,12 +18554,12 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB78_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 
+; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB78_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -18596,10 +18596,10 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB78_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB78_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -18636,10 +18636,10 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB78_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB78_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -18676,10 +18676,10 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB78_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB78_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -18717,10 +18717,10 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB78_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB78_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -18758,13 +18758,13 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB78_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB78_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -18804,13 +18804,13 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: 
s_cbranch_execnz .LBB78_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB78_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -18883,12 +18883,12 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB79_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB79_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -18925,10 +18925,10 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB79_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB79_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -18965,10 +18965,10 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB79_1 +; 
GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB79_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -19005,10 +19005,10 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB79_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB79_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -19048,10 +19048,10 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB79_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB79_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -19088,13 +19088,13 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; 
GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB79_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB79_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -19134,13 +19134,13 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB79_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB79_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -19214,12 +19214,12 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB80_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB80_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -19256,10 +19256,10 @@ define <2 x bfloat> 
@global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB80_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB80_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -19296,10 +19296,10 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB80_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB80_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -19336,10 +19336,10 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB80_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB80_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -19379,10 +19379,10 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; 
GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB80_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB80_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -19423,13 +19423,13 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB80_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB80_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -19471,13 +19471,13 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB80_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB80_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: 
s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512 @@ -19548,12 +19548,12 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB81_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB81_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -19589,10 +19589,10 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB81_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB81_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -19627,11 +19627,11 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB81_1 +; GFX90A-NEXT: 
s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB81_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -19666,11 +19666,11 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB81_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB81_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -19706,11 +19706,11 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB81_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB81_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -19747,13 +19747,13 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 
+; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB81_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB81_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -19791,13 +19791,13 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB81_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB81_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -19867,12 +19867,12 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB82_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB82_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: 
s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -19908,10 +19908,10 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB82_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB82_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -19946,11 +19946,11 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB82_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB82_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -19985,11 +19985,11 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB82_1 +; 
GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB82_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -20027,11 +20027,11 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB82_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB82_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -20068,13 +20068,13 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB82_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB82_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -20112,13 +20112,13 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; 
GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB82_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB82_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 @@ -20189,12 +20189,12 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB83_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB83_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -20230,10 +20230,10 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB83_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB83_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -20268,11 +20268,11 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB83_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB83_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -20307,11 +20307,11 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB83_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB83_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -20349,11 +20349,11 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB83_1 +; 
GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB83_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -20394,13 +20394,13 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB83_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB83_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -20442,13 +20442,13 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB83_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB83_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512 @@ -20520,12 +20520,12 @@ define <2 x bfloat> 
@global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB84_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB84_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -20562,10 +20562,10 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB84_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB84_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -20604,10 +20604,10 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB84_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB84_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -20644,10 +20644,10 @@ define <2 x bfloat> 
@global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB84_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB84_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -20687,10 +20687,10 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB84_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB84_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -20727,13 +20727,13 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB84_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB84_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: 
s_setpc_b64 s[30:31] @@ -20773,13 +20773,13 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB84_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB84_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -20852,12 +20852,12 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB85_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB85_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -20893,10 +20893,10 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB85_1 +; GFX10-NEXT: 
s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB85_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -20933,11 +20933,11 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB85_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB85_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -20972,11 +20972,11 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB85_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB85_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -21014,11 +21014,11 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: 
s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB85_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB85_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -21055,13 +21055,13 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB85_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB85_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -21099,13 +21099,13 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB85_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB85_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt 
expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 @@ -21177,12 +21177,12 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB86_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB86_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -21219,10 +21219,10 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB86_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB86_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -21259,10 +21259,10 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB86_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB86_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: 
s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -21299,10 +21299,10 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB86_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB86_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -21340,10 +21340,10 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB86_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB86_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -21381,13 +21381,13 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB86_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB86_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; 
GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -21427,13 +21427,13 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB86_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB86_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -21505,12 +21505,12 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB87_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB87_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: @@ -21546,10 +21546,10 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 
-; GFX10-NEXT: s_cbranch_execnz .LBB87_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB87_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: @@ -21584,11 +21584,11 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB87_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB87_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: @@ -21623,11 +21623,11 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB87_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB87_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: @@ -21663,11 +21663,11 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 
s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB87_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB87_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: @@ -21704,13 +21704,13 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB87_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB87_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: @@ -21748,13 +21748,13 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB87_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB87_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = 
atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 @@ -21825,12 +21825,12 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB88_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB88_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -21867,10 +21867,10 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB88_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB88_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -21907,10 +21907,10 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB88_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB88_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -21947,10 +21947,10 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB88_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB88_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -21988,10 +21988,10 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB88_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB88_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -22029,13 +22029,13 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB88_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB88_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, 
exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -22075,13 +22075,13 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB88_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB88_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -22153,12 +22153,12 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB89_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB89_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -22194,10 +22194,10 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB89_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB89_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -22232,11 +22232,11 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB89_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB89_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -22271,11 +22271,11 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB89_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB89_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -22311,11 +22311,11 @@ define void 
@global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB89_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB89_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -22352,13 +22352,13 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB89_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB89_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -22396,13 +22396,13 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: 
s_cbranch_execnz .LBB89_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB89_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 @@ -22473,12 +22473,12 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB90_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB90_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -22515,10 +22515,10 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB90_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB90_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -22555,10 +22555,10 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: 
s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB90_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB90_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -22595,10 +22595,10 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB90_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB90_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -22636,10 +22636,10 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB90_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB90_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -22677,13 +22677,13 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[10:11], 
exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB90_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB90_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -22723,13 +22723,13 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB90_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB90_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -22801,12 +22801,12 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB91_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB91_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: 
global_agent_atomic_fadd_noret_v2bf16__maybe_remote: @@ -22842,10 +22842,10 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB91_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB91_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: @@ -22880,11 +22880,11 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB91_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB91_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: @@ -22919,11 +22919,11 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB91_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB91_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: 
global_agent_atomic_fadd_noret_v2bf16__maybe_remote: @@ -22959,11 +22959,11 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB91_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB91_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: @@ -23000,13 +23000,13 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB91_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB91_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: @@ -23044,13 +23044,13 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: 
s_cbranch_execnz .LBB91_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB91_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst @@ -23065,11 +23065,12 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; GFX12-LABEL: infer_as_before_atomic: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12-NEXT: s_cbranch_execz .LBB92_2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX12-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12-NEXT: s_cbranch_scc0 .LBB92_2 ; GFX12-NEXT: ; %bb.1: ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_bcnt1_i32_b32 s0, s0 @@ -23090,8 +23091,9 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB92_2 +; GFX940-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX940-NEXT: s_cmov_b64 exec, vcc +; GFX940-NEXT: s_cbranch_scc0 .LBB92_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -23107,11 +23109,12 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; GFX11-LABEL: infer_as_before_atomic: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_mov_b32 s1, exec_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: s_cbranch_execz .LBB92_2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX11-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB92_2 ; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-NEXT: s_bcnt1_i32_b32 s0, s0 @@ -23132,8 +23135,9 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB92_3 +; GFX10-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB92_3 ; GFX10-NEXT: ; %bb.1: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: s_bcnt1_i32_b32 s3, s5 @@ -23153,8 +23157,9 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB92_2 +; GFX10-NEXT: s_andn2_b32 s2, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB92_2 ; GFX10-NEXT: .LBB92_3: ; GFX10-NEXT: s_endpgm ; @@ -23164,8 +23169,9 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB92_2 +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB92_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, 
s[0:1] @@ -23184,8 +23190,9 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX908-NEXT: s_cbranch_execz .LBB92_2 +; GFX908-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX908-NEXT: s_cmov_b64 exec, vcc +; GFX908-NEXT: s_cbranch_scc0 .LBB92_2 ; GFX908-NEXT: ; %bb.1: ; GFX908-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX908-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -23204,8 +23211,9 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB92_3 +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB92_3 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NEXT: s_bcnt1_i32_b64 s5, s[0:1] @@ -23226,9 +23234,10 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_cbranch_execnz .LBB92_2 +; GFX8-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX8-NEXT: s_cbranch_scc1 .LBB92_2 ; GFX8-NEXT: .LBB92_3: ; GFX8-NEXT: s_endpgm ; @@ -23238,8 +23247,9 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7-NEXT: s_cbranch_execz .LBB92_3 +; GFX7-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7-NEXT: s_cmov_b64 exec, 
vcc +; GFX7-NEXT: s_cbranch_scc0 .LBB92_3 ; GFX7-NEXT: ; %bb.1: ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX7-NEXT: s_bcnt1_i32_b64 s6, s[4:5] @@ -23262,9 +23272,10 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB92_2 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB92_2 ; GFX7-NEXT: .LBB92_3: ; GFX7-NEXT: s_endpgm ; @@ -23274,8 +23285,9 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX6-NEXT: s_cbranch_execz .LBB92_3 +; GFX6-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX6-NEXT: s_cmov_b64 exec, vcc +; GFX6-NEXT: s_cbranch_scc0 .LBB92_3 ; GFX6-NEXT: ; %bb.1: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: s_bcnt1_i32_b64 s6, s[4:5] @@ -23299,9 +23311,10 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v3 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB92_2 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB92_2 ; GFX6-NEXT: .LBB92_3: ; GFX6-NEXT: s_endpgm %load = load ptr, ptr addrspace(4) %arg diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index 06d971febd0380..0607de77dd27d4 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ 
b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -46,10 +46,10 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB0_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -90,10 +90,10 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -114,10 +114,10 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB0_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -138,10 +138,10 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX8-NEXT: buffer_wbinvl1 
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB0_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -208,10 +208,10 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB1_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -252,10 +252,10 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -276,10 +276,10 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB1_1 +; GFX908-NEXT: 
s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -302,10 +302,10 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB1_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -372,10 +372,10 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB2_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -416,10 +416,10 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX90A-NEXT: ; 
%bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -440,10 +440,10 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB2_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -466,10 +466,10 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB2_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -535,11 +535,11 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB3_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; 
; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: @@ -578,11 +578,11 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: @@ -601,11 +601,11 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB3_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: @@ -624,11 +624,11 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB3_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, 
s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: @@ -691,11 +691,11 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB4_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -734,11 +734,11 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -757,11 +757,11 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB4_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: 
s_cbranch_scc1 .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -782,11 +782,11 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB4_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -850,11 +850,11 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB5_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -893,11 +893,11 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: 
s_cbranch_execnz .LBB5_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -916,11 +916,11 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB5_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -941,11 +941,11 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB5_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -1010,10 +1010,10 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; 
GFX940-NEXT: s_cbranch_execnz .LBB6_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -1056,10 +1056,10 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1080,10 +1080,10 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB6_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -1106,10 +1106,10 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB6_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB6_1 ; 
GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1175,11 +1175,11 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB7_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1220,11 +1220,11 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1243,11 +1243,11 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: 
s_cbranch_execnz .LBB7_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1268,11 +1268,11 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB7_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1337,10 +1337,10 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB8_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -1364,11 +1364,11 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -1391,10 +1391,10 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB8_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1415,10 +1415,10 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1439,10 +1439,10 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB8_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: 
s_cbranch_scc1 .LBB8_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -1463,10 +1463,10 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB8_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1493,10 +1493,10 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1524,10 +1524,10 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB8_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -1568,10 +1568,10 @@ 
define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB9_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -1612,10 +1612,10 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1636,10 +1636,10 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB9_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -1660,10 +1660,10 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 
s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB9_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1734,10 +1734,10 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB10_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -1778,10 +1778,10 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1802,10 +1802,10 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB10_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: 
s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -1826,10 +1826,10 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB10_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1896,10 +1896,10 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB11_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -1940,10 +1940,10 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 
v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1964,10 +1964,10 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB11_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -1990,10 +1990,10 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB11_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2060,10 +2060,10 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB12_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -2104,10 +2104,10 @@ define float 
@global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2128,10 +2128,10 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB12_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -2154,10 +2154,10 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB12_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2223,11 +2223,11 @@ define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: 
v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB13_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -2266,11 +2266,11 @@ define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -2289,11 +2289,11 @@ define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB13_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -2312,11 +2312,11 @@ define void 
@global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB13_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -2379,11 +2379,11 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB14_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2422,11 +2422,11 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: 
global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2445,11 +2445,11 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB14_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2470,11 +2470,11 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB14_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2538,11 +2538,11 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB15_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX940-NEXT: ; %bb.2: 
; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2581,11 +2581,11 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2604,11 +2604,11 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB15_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2629,11 +2629,11 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: 
s_cbranch_execnz .LBB15_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2698,10 +2698,10 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB16_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -2744,10 +2744,10 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2768,10 +2768,10 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB16_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 
.LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -2794,10 +2794,10 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB16_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2863,11 +2863,11 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB17_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2908,11 +2908,11 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; 
GFX90A-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2931,11 +2931,11 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB17_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2956,11 +2956,11 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB17_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -3021,11 +3021,11 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, 
exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -3058,11 +3058,11 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3102,10 +3102,10 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -3128,10 +3128,10 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: 
buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB18_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -3193,11 +3193,11 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -3230,11 +3230,11 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 
:: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3274,10 +3274,10 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB19_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -3302,10 +3302,10 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB19_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3366,11 +3366,11 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB20_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -3403,11 +3403,11 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB20_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3447,10 +3447,10 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB20_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -3475,10 +3475,10 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB20_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; 
GFX8-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3539,11 +3539,11 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB21_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -3575,11 +3575,11 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB21_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -3617,11 +3617,11 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; 
GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB21_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -3641,11 +3641,11 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB21_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -3701,11 +3701,11 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB22_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3737,11 +3737,11 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB22_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3779,11 +3779,11 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB22_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3805,11 +3805,11 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; 
GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB22_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3866,11 +3866,11 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB23_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3902,11 +3902,11 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB23_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: 
global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3944,11 +3944,11 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB23_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3970,11 +3970,11 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB23_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -4031,11 +4031,11 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB24_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 
s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -4068,11 +4068,11 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB24_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4096,10 +4096,10 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB24_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4121,10 +4121,10 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz 
.LBB24_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -4147,10 +4147,10 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB24_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -4173,10 +4173,10 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB24_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4209,10 +4209,10 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB24_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: 
s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: @@ -4244,10 +4244,10 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB24_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 @@ -4279,11 +4279,11 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB25_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -4316,11 +4316,11 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; 
GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB25_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4360,10 +4360,10 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB25_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -4386,10 +4386,10 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB25_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4469,11 +4469,11 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 
vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB26_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -4505,10 +4505,10 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB26_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -4546,11 +4546,11 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4582,10 +4582,10 @@ define half 
@global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB26_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4616,10 +4616,10 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4650,10 +4650,10 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB26_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -4685,10 +4685,10 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 
s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB26_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -4726,11 +4726,11 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB26_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -4769,11 +4769,11 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB26_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -4822,11 +4822,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB27_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -4860,10 +4860,10 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB27_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -4902,11 +4902,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4939,10 +4939,10 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: buffer_gl0_inv ; 
GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB27_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4974,10 +4974,10 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5009,10 +5009,10 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB27_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -5045,10 +5045,10 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz 
.LBB27_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -5087,11 +5087,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB27_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -5132,11 +5132,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB27_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -5186,11 +5186,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: 
s_cbranch_execnz .LBB28_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -5225,10 +5225,10 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB28_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -5267,11 +5267,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB28_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -5304,10 +5304,10 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; 
GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB28_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -5339,10 +5339,10 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5374,10 +5374,10 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB28_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -5410,10 +5410,10 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB28_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, 
s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -5452,11 +5452,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB28_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -5497,11 +5497,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB28_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -5550,11 +5550,11 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB29_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: @@ -5584,11 +5584,11 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB29_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: @@ -5625,11 +5625,11 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB29_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: @@ -5660,10 +5660,10 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; 
GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB29_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: @@ -5692,11 +5692,11 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: @@ -5725,11 +5725,11 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB29_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: @@ -5759,11 +5759,11 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: 
buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB29_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: @@ -5800,11 +5800,11 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v7 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB29_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: @@ -5842,11 +5842,11 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v7 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB29_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fmax ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -5893,11 +5893,11 @@ define void 
@global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB30_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5929,11 +5929,11 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB30_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5971,11 +5971,11 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB30_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: 
s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6007,10 +6007,10 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB30_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6040,11 +6040,11 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6074,11 +6074,11 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, 
exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB30_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6109,11 +6109,11 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB30_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6151,11 +6151,11 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB30_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6194,11 +6194,11 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, 
s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB30_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 @@ -6246,11 +6246,11 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB31_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6283,11 +6283,11 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB31_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6325,11 +6325,11 @@ define void 
@global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB31_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6361,10 +6361,10 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB31_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6394,11 +6394,11 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, 
s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6428,11 +6428,11 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB31_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6463,11 +6463,11 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB31_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6505,11 +6505,11 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB31_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 
.LBB31_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6548,11 +6548,11 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB31_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024 @@ -6588,11 +6588,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB32_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -6616,10 +6616,10 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB32_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], 
exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -6646,11 +6646,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB32_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -6675,10 +6675,10 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB32_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -6701,10 +6701,10 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], 
s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6727,10 +6727,10 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB32_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -6755,10 +6755,10 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB32_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -6787,11 +6787,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB32_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: 
s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -6822,11 +6822,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB32_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -6863,11 +6863,11 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB33_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -6889,11 +6889,11 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB33_1 +; GFX940-NEXT: 
s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -6919,11 +6919,11 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB33_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -6947,10 +6947,10 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB33_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -6971,11 +6971,11 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, 
s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -6996,11 +6996,11 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB33_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -7023,11 +7023,11 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB33_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -7056,11 +7056,11 @@ define void 
@global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB33_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -7090,11 +7090,11 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB33_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 @@ -7142,11 +7142,11 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB34_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 
exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -7180,10 +7180,10 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB34_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -7222,11 +7222,11 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB34_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -7259,10 +7259,10 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB34_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; 
GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -7296,10 +7296,10 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7331,10 +7331,10 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB34_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -7367,10 +7367,10 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB34_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -7409,11 +7409,11 @@ define half 
@global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB34_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -7454,11 +7454,11 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB34_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -7508,11 +7508,11 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB35_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 
s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7544,11 +7544,11 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB35_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7586,11 +7586,11 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB35_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7622,10 +7622,10 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB35_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: 
s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7657,11 +7657,11 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7691,11 +7691,11 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB35_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7726,11 +7726,11 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 
v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB35_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7768,11 +7768,11 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB35_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7811,11 +7811,11 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB35_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 @@ -7872,11 +7872,11 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; 
GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB36_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -7915,10 +7915,10 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB36_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -7963,11 +7963,11 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB36_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -8003,10 +8003,10 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; 
GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB36_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -8042,10 +8042,10 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB36_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8081,10 +8081,10 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB36_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -8121,10 +8121,10 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; 
GFX8-NEXT: s_cbranch_execnz .LBB36_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -8163,11 +8163,11 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB36_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -8207,11 +8207,11 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB36_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -8267,11 +8267,11 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, 
exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB37_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -8312,10 +8312,10 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB37_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -8362,11 +8362,11 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB37_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -8403,10 +8403,10 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: 
s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB37_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -8443,10 +8443,10 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB37_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8483,10 +8483,10 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB37_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -8524,10 +8524,10 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB37_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; 
GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -8567,11 +8567,11 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB37_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -8613,11 +8613,11 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB37_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -8674,11 +8674,11 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB38_1 +; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -8720,10 +8720,10 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB38_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -8770,11 +8770,11 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB38_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -8811,10 +8811,10 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; 
GFX10-NEXT: s_cbranch_execnz .LBB38_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -8851,10 +8851,10 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB38_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8891,10 +8891,10 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB38_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -8932,10 +8932,10 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB38_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 
.LBB38_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -8975,11 +8975,11 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB38_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -9021,11 +9021,11 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB38_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -9079,11 +9079,11 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB39_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 
s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: @@ -9120,11 +9120,11 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB39_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: @@ -9167,11 +9167,11 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB39_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: @@ -9206,10 +9206,10 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 
s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB39_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: @@ -9243,11 +9243,11 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB39_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: @@ -9281,11 +9281,11 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB39_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: @@ -9320,11 +9320,11 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB39_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: @@ -9362,11 +9362,11 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v7 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB39_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: @@ -9405,11 +9405,11 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v7 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB39_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fmax ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -9462,11 +9462,11 @@ define void 
@global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB40_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9505,11 +9505,11 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB40_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9554,11 +9554,11 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB40_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: 
s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9594,10 +9594,10 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB40_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9632,11 +9632,11 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB40_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9671,11 +9671,11 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 
exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB40_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9711,11 +9711,11 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB40_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9754,11 +9754,11 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB40_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9798,11 +9798,11 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], 
exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB40_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 @@ -9856,11 +9856,11 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB41_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -9900,11 +9900,11 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB41_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -9949,11 +9949,11 @@ define void 
@global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB41_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -9989,10 +9989,10 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB41_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10027,11 +10027,11 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB41_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, 
s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10066,11 +10066,11 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB41_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10106,11 +10106,11 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB41_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10149,11 +10149,11 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB41_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: 
s_cbranch_scc1 .LBB41_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10193,11 +10193,11 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB41_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024 @@ -10240,11 +10240,11 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB42_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -10276,10 +10276,10 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB42_1 +; 
GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -10314,11 +10314,11 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB42_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -10348,10 +10348,10 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB42_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -10381,10 +10381,10 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB42_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; 
GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10414,10 +10414,10 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB42_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -10448,10 +10448,10 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB42_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -10481,11 +10481,11 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB42_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB42_1 ; 
GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -10517,11 +10517,11 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB42_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -10564,11 +10564,11 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB43_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -10598,11 +10598,11 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; 
GFX940-NEXT: s_cbranch_execnz .LBB43_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -10635,11 +10635,11 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB43_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -10668,10 +10668,10 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB43_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -10699,11 +10699,11 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: 
v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB43_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -10731,11 +10731,11 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB43_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -10764,11 +10764,11 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB43_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -10798,11 
+10798,11 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB43_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -10833,11 +10833,11 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB43_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 @@ -10892,11 +10892,11 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB44_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX12-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -10937,10 +10937,10 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB44_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -10987,11 +10987,11 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB44_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -11028,10 +11028,10 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB44_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -11070,10 +11070,10 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11110,10 +11110,10 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB44_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -11151,10 +11151,10 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB44_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -11194,11 +11194,11 @@ define 
bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB44_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -11240,11 +11240,11 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB44_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -11300,11 +11300,11 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB45_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; 
GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11343,11 +11343,11 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB45_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11392,11 +11392,11 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB45_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11432,10 +11432,10 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB45_1 +; GFX10-NEXT: 
s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11472,11 +11472,11 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB45_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11511,11 +11511,11 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB45_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11551,11 +11551,11 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: 
s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB45_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11594,11 +11594,11 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB45_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11638,11 +11638,11 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB45_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 @@ -11679,11 +11679,11 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB46_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -11706,10 +11706,10 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB46_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -11733,11 +11733,11 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB46_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -11760,10 +11760,10 @@ define <2 x half> 
@global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB46_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -11784,10 +11784,10 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB46_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11808,10 +11808,10 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB46_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -11836,10 +11836,10 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; 
GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB46_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -11867,27 +11867,27 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v2, v3 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v4 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX7-NEXT: v_mov_b32_e32 v7, v3 +; GFX7-NEXT: v_mov_b32_e32 v6, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB46_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; 
GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -11913,31 +11913,31 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v2, v3 ; GFX6-NEXT: v_max_f32_e32 v6, v6, v4 ; GFX6-NEXT: v_max_f32_e32 v7, v7, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX6-NEXT: v_mov_b32_e32 v7, v3 +; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB46_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -11971,11 +11971,11 @@ define <2 x half> 
@global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB47_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -11998,10 +11998,10 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB47_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -12025,11 +12025,11 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB47_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: 
v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -12052,10 +12052,10 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB47_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -12076,10 +12076,10 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -12100,10 +12100,10 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB47_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -12130,10 +12130,10 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX8-NEXT: 
buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB47_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12160,27 +12160,27 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v2, v3 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v4 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX7-NEXT: v_mov_b32_e32 v7, v3 +; GFX7-NEXT: v_mov_b32_e32 v6, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB47_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: 
s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -12206,31 +12206,31 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v2, v3 ; GFX6-NEXT: v_max_f32_e32 v6, v6, v4 ; GFX6-NEXT: v_max_f32_e32 v7, v7, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX6-NEXT: v_mov_b32_e32 v7, v3 +; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB47_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX6-NEXT: ; 
%bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -12265,11 +12265,11 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB48_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -12292,10 +12292,10 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB48_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -12319,11 +12319,11 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB48_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: 
s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -12346,10 +12346,10 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB48_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -12370,10 +12370,10 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -12394,10 +12394,10 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB48_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 
exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -12424,10 +12424,10 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB48_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -12458,27 +12458,27 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_or_b32_e32 v0, v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v8 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; 
GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB48_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -12506,31 +12506,31 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_max_f32_e32 v7, v7, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_or_b32_e32 v0, v7, v0 +; GFX6-NEXT: v_mov_b32_e32 v7, v1 +; GFX6-NEXT: v_mov_b32_e32 v6, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v8 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, 
s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB48_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 @@ -12563,11 +12563,11 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB49_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -12588,11 +12588,11 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB49_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -12615,11 +12615,11 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; 
GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB49_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -12641,10 +12641,10 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB49_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -12663,11 +12663,11 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: 
global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -12686,11 +12686,11 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB49_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -12713,11 +12713,11 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB49_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -12744,27 +12744,27 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 
v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB49_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -12788,31 +12788,31 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 
16, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB49_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -12844,11 +12844,11 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB50_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12869,11 +12869,11 @@ define void 
@global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB50_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12896,11 +12896,11 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB50_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12922,10 +12922,10 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB50_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, 
exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12944,11 +12944,11 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12967,11 +12967,11 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB50_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12996,11 +12996,11 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB50_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], 
s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13027,27 +13027,27 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB50_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: 
global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13071,31 +13071,31 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB50_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x 
half>, ptr addrspace(1) %ptr, i64 511 @@ -13128,11 +13128,11 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB51_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13153,11 +13153,11 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB51_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13180,11 +13180,11 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB51_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13206,10 +13206,10 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB51_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13228,11 +13228,11 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13251,11 +13251,11 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: 
s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB51_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13280,11 +13280,11 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB51_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13315,27 +13315,27 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; 
GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB51_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13363,31 +13363,31 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 
glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB51_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 @@ -13420,11 +13420,11 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB52_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -13447,10 +13447,10 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB52_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] 
+; GFX940-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -13474,11 +13474,11 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB52_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -13501,10 +13501,10 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB52_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -13527,10 +13527,10 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX90A-NEXT: 
; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -13551,10 +13551,10 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB52_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -13581,10 +13581,10 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB52_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13611,27 +13611,27 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v2, v3 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v4 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: 
v_mov_b32_e32 v9, v7 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX7-NEXT: v_mov_b32_e32 v7, v3 +; GFX7-NEXT: v_mov_b32_e32 v6, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB52_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -13657,31 +13657,31 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v2, v3 ; GFX6-NEXT: v_max_f32_e32 v6, v6, v4 ; GFX6-NEXT: v_max_f32_e32 v7, v7, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: 
buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX6-NEXT: v_mov_b32_e32 v7, v3 +; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB52_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -13716,11 +13716,11 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB53_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13741,11 +13741,11 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX940-NEXT: 
buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB53_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13768,11 +13768,11 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB53_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13794,10 +13794,10 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB53_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: 
global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13818,11 +13818,11 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13841,11 +13841,11 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB53_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13870,11 +13870,11 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB53_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX8-NEXT: ; 
%bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13901,27 +13901,27 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB53_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13945,31 +13945,31 @@ define void 
@global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB53_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 @@ -14024,11 +14024,11 @@ define <2 x bfloat> 
@global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB54_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -14067,10 +14067,10 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB54_1 +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -14114,12 +14114,12 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB54_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: 
s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -14156,10 +14156,10 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB54_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -14196,10 +14196,10 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -14236,10 +14236,10 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB54_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -14277,10 +14277,10 @@ define <2 x bfloat> 
@global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB54_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -14318,13 +14318,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB54_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -14364,13 +14364,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB54_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: 
s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -14422,11 +14422,11 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB55_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -14465,10 +14465,10 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB55_1 +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -14512,12 +14512,12 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB55_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; 
GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -14554,10 +14554,10 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB55_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -14594,10 +14594,10 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -14634,10 +14634,10 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB55_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -14677,10 +14677,10 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB55_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14717,13 +14717,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB55_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -14763,13 +14763,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; 
GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB55_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -14822,11 +14822,11 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB56_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -14865,10 +14865,10 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB56_1 +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -14912,12 +14912,12 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB56_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -14954,10 +14954,10 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB56_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -14994,10 +14994,10 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -15034,10 +15034,10 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: 
s_cbranch_execnz .LBB56_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -15077,10 +15077,10 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB56_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -15121,13 +15121,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB56_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -15169,13 +15169,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 
vcc, v6, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB56_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512 @@ -15225,11 +15225,11 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB57_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -15266,11 +15266,11 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB57_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; 
GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -15312,12 +15312,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB57_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -15353,10 +15353,10 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB57_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -15391,11 +15391,11 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: 
s_cbranch_execnz .LBB57_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -15430,11 +15430,11 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB57_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -15470,11 +15470,11 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB57_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -15484,25 +15484,25 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v5, 
v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 @@ -15511,13 +15511,13 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB57_1 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: 
global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -15527,26 +15527,26 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 @@ -15555,13 +15555,13 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; 
GFX6-NEXT: s_cbranch_execnz .LBB57_1 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -15610,11 +15610,11 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB58_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15651,11 +15651,11 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB58_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15697,12 +15697,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB58_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15738,10 +15738,10 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB58_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15776,11 +15776,11 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz 
.LBB58_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15815,11 +15815,11 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB58_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15857,11 +15857,11 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB58_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15871,25 +15871,25 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 
+; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 @@ -15898,13 +15898,13 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB58_1 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; 
GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15914,26 +15914,26 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 @@ -15942,13 +15942,13 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; 
GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB58_1 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 @@ -15998,11 +15998,11 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB59_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -16039,11 +16039,11 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB59_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -16085,12 +16085,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB59_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -16126,10 +16126,10 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB59_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -16164,11 +16164,11 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: 
s_cbranch_execnz .LBB59_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -16203,11 +16203,11 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB59_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -16245,11 +16245,11 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB59_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -16259,29 +16259,29 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 
addr64 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 @@ -16290,13 +16290,13 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB59_1 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 
s[10:11], exec, s[8:9] +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -16306,30 +16306,30 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; 
GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 @@ -16338,13 +16338,13 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB59_1 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512 @@ -16395,11 +16395,11 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB60_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -16438,10 +16438,10 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], 
vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB60_1 +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -16485,12 +16485,12 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB60_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -16527,10 +16527,10 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB60_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -16569,10 +16569,10 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, 
s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB60_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -16609,10 +16609,10 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB60_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -16652,10 +16652,10 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB60_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16692,13 +16692,13 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_and_b32_e32 v2, 
0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB60_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -16738,13 +16738,13 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB60_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -16796,11 +16796,11 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB61_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16837,11 +16837,11 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB61_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16883,12 +16883,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB61_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16924,10 +16924,10 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB61_1 +; GFX10-NEXT: 
s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16964,11 +16964,11 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB61_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -17003,11 +17003,11 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB61_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -17045,11 +17045,11 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: 
s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB61_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -17059,25 +17059,25 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 @@ -17086,13 
+17086,13 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB61_1 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -17102,26 +17102,26 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 @@ -17130,13 +17130,13 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB61_1 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index 65df8f07fb8b3b..2c9e223210a1e6 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -46,10 +46,10 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB0_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 
.LBB0_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -90,10 +90,10 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -114,10 +114,10 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB0_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -138,10 +138,10 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB0_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -208,10 +208,10 @@ define float 
@global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB1_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -252,10 +252,10 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -276,10 +276,10 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB1_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -302,10 +302,10 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, 
s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB1_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -372,10 +372,10 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB2_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -416,10 +416,10 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -440,10 +440,10 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB2_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, 
s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -466,10 +466,10 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB2_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -535,11 +535,11 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB3_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: @@ -578,11 +578,11 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 +; GFX90A-NEXT: 
s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: @@ -601,11 +601,11 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB3_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: @@ -624,11 +624,11 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB3_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: @@ -691,11 +691,11 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; 
GFX940-NEXT: s_cbranch_execnz .LBB4_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -734,11 +734,11 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -757,11 +757,11 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB4_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -782,11 +782,11 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 
s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB4_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -850,11 +850,11 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB5_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -893,11 +893,11 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -916,11 +916,11 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: 
v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB5_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -941,11 +941,11 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB5_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -1010,10 +1010,10 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB6_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -1056,10 +1056,10 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: 
v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1080,10 +1080,10 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB6_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -1106,10 +1106,10 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB6_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1175,11 +1175,11 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; 
GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB7_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1220,11 +1220,11 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1243,11 +1243,11 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB7_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1268,11 +1268,11 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: 
v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB7_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1337,10 +1337,10 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB8_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -1364,11 +1364,11 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -1391,10 +1391,10 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX10-NEXT: 
buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB8_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1415,10 +1415,10 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1439,10 +1439,10 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB8_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -1463,10 +1463,10 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB8_1 +; 
GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1493,10 +1493,10 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1524,10 +1524,10 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB8_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -1568,10 +1568,10 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB9_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: 
s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -1612,10 +1612,10 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1636,10 +1636,10 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB9_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -1660,10 +1660,10 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB9_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1734,10 +1734,10 @@ define float 
@global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB10_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -1778,10 +1778,10 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1802,10 +1802,10 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB10_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -1826,10 +1826,10 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 
s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB10_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1896,10 +1896,10 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB11_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -1940,10 +1940,10 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1964,10 +1964,10 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB11_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: 
s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -1990,10 +1990,10 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB11_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2060,10 +2060,10 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB12_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -2104,10 +2104,10 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2128,10 +2128,10 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB12_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -2154,10 +2154,10 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB12_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2223,11 +2223,11 @@ define void @global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB13_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX11-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -2266,11 +2266,11 @@ define void @global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -2289,11 +2289,11 @@ define void @global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB13_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -2312,11 +2312,11 @@ define void @global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB13_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -2379,11 +2379,11 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB14_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2422,11 +2422,11 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2445,11 +2445,11 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB14_1 +; 
GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2470,11 +2470,11 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB14_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2538,11 +2538,11 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB15_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2581,11 +2581,11 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: 
v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2604,11 +2604,11 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB15_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2629,11 +2629,11 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB15_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2698,10 +2698,10 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 
vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB16_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -2744,10 +2744,10 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2768,10 +2768,10 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB16_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -2794,10 +2794,10 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB16_1 +; GFX8-NEXT: s_andn2_b64 
s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2863,11 +2863,11 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB17_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2908,11 +2908,11 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2931,11 +2931,11 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: 
s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB17_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2956,11 +2956,11 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB17_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -3021,11 +3021,11 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -3058,11 +3058,11 @@ define double 
@global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3102,10 +3102,10 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -3128,10 +3128,10 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB18_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: 
s_setpc_b64 s[30:31] @@ -3193,11 +3193,11 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -3230,11 +3230,11 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3274,10 +3274,10 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB19_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; 
GFX908-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -3302,10 +3302,10 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB19_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3366,11 +3366,11 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB20_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -3403,11 +3403,11 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; 
GFX11-NEXT: s_cbranch_execnz .LBB20_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3447,10 +3447,10 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB20_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -3475,10 +3475,10 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB20_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3539,11 +3539,11 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_or_b32 s0, 
vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB21_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -3575,11 +3575,11 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB21_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -3617,11 +3617,11 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB21_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; 
GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -3641,11 +3641,11 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB21_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -3701,11 +3701,11 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB22_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3737,11 +3737,11 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; 
GFX11-NEXT: s_cbranch_execnz .LBB22_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3779,11 +3779,11 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB22_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3805,11 +3805,11 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB22_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3866,11 +3866,11 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g 
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB23_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3902,11 +3902,11 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB23_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3944,11 +3944,11 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB23_1 +; GFX908-NEXT: 
s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3970,11 +3970,11 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB23_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -4031,11 +4031,11 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB24_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -4068,11 +4068,11 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB24_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4096,10 +4096,10 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB24_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4121,10 +4121,10 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -4147,10 +4147,10 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 
vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB24_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -4173,10 +4173,10 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB24_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4209,10 +4209,10 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB24_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: @@ -4244,10 +4244,10 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: 
s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB24_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 @@ -4279,11 +4279,11 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB25_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -4316,11 +4316,11 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB25_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; 
GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4360,10 +4360,10 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB25_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -4386,10 +4386,10 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB25_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4469,11 +4469,11 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB26_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; 
GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -4505,10 +4505,10 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB26_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -4546,11 +4546,11 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4582,10 +4582,10 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB26_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, 
v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4616,10 +4616,10 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4650,10 +4650,10 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB26_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -4685,10 +4685,10 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB26_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -4726,11 +4726,11 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: 
buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB26_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -4769,11 +4769,11 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB26_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -4822,11 +4822,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB27_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -4860,10 +4860,10 @@ 
define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB27_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -4902,11 +4902,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4939,10 +4939,10 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB27_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4974,10 +4974,10 @@ define half 
@global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5009,10 +5009,10 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB27_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -5045,10 +5045,10 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB27_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -5087,11 +5087,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 
s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB27_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -5132,11 +5132,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB27_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -5186,11 +5186,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB28_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -5225,10 +5225,10 @@ define half 
@global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB28_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -5267,11 +5267,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB28_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -5304,10 +5304,10 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB28_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -5339,10 +5339,10 @@ define half 
@global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5374,10 +5374,10 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB28_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -5410,10 +5410,10 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB28_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -5452,11 +5452,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 
s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB28_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -5497,11 +5497,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB28_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -5550,11 +5550,11 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB29_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: @@ -5584,11 +5584,11 @@ define void 
@global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB29_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: @@ -5625,11 +5625,11 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB29_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: @@ -5660,10 +5660,10 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB29_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 
s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: @@ -5692,11 +5692,11 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: @@ -5725,11 +5725,11 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB29_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: @@ -5759,11 +5759,11 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB29_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: @@ -5800,11 +5800,11 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v7 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB29_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: @@ -5842,11 +5842,11 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v7 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB29_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fmin ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -5893,11 +5893,11 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB30_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5929,11 +5929,11 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB30_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5971,11 +5971,11 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB30_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6007,10 +6007,10 @@ define void 
@global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB30_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6040,11 +6040,11 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6074,11 +6074,11 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB30_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: 
global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6109,11 +6109,11 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB30_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6151,11 +6151,11 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB30_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6194,11 +6194,11 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB30_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, 
s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 @@ -6246,11 +6246,11 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB31_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6283,11 +6283,11 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB31_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6325,11 +6325,11 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: 
s_cbranch_execnz .LBB31_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6361,10 +6361,10 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB31_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6394,11 +6394,11 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6428,11 +6428,11 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: 
v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB31_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6463,11 +6463,11 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB31_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6505,11 +6505,11 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB31_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6548,11 +6548,11 @@ define void 
@global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB31_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024 @@ -6588,11 +6588,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB32_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -6616,10 +6616,10 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB32_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 
s[30:31] ; @@ -6646,11 +6646,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB32_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -6675,10 +6675,10 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB32_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -6701,10 +6701,10 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6727,10 +6727,10 @@ define half 
@global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB32_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -6755,10 +6755,10 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB32_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -6787,11 +6787,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB32_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -6822,11 +6822,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX6-NEXT: buffer_wbinvl1 ; 
GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB32_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -6863,11 +6863,11 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB33_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -6889,11 +6889,11 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB33_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -6919,11 +6919,11 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB33_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -6947,10 +6947,10 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB33_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -6971,11 +6971,11 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 +; GFX90A-NEXT: s_cselect_b64 
exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -6996,11 +6996,11 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB33_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -7023,11 +7023,11 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB33_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -7056,11 +7056,11 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v5 -; 
GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB33_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -7090,11 +7090,11 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB33_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 @@ -7142,11 +7142,11 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB34_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -7180,10 +7180,10 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 
vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB34_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -7222,11 +7222,11 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB34_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -7259,10 +7259,10 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB34_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -7296,10 +7296,10 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 
s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7331,10 +7331,10 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB34_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -7367,10 +7367,10 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB34_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -7409,11 +7409,11 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: 
s_cbranch_execnz .LBB34_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -7454,11 +7454,11 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB34_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -7508,11 +7508,11 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB35_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7544,11 +7544,11 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], 
vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB35_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7586,11 +7586,11 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB35_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7622,10 +7622,10 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB35_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7657,11 +7657,11 @@ define void 
@global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7691,11 +7691,11 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB35_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7726,11 +7726,11 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB35_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: 
global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7768,11 +7768,11 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB35_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7811,11 +7811,11 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB35_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 @@ -7872,11 +7872,11 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB36_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, 
s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -7915,10 +7915,10 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB36_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -7963,11 +7963,11 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB36_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -8003,10 +8003,10 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB36_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 
.LBB36_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -8042,10 +8042,10 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB36_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8081,10 +8081,10 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB36_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -8121,10 +8121,10 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB36_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 
s[30:31] ; @@ -8163,11 +8163,11 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB36_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -8207,11 +8207,11 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB36_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -8267,11 +8267,11 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB37_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -8312,10 +8312,10 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB37_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -8362,11 +8362,11 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB37_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -8403,10 +8403,10 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB37_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, 
exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -8443,10 +8443,10 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB37_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8483,10 +8483,10 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB37_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -8524,10 +8524,10 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB37_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -8567,11 +8567,11 @@ define bfloat 
@global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB37_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -8613,11 +8613,11 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB37_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -8674,11 +8674,11 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB38_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: 
v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -8720,10 +8720,10 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB38_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -8770,11 +8770,11 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB38_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -8811,10 +8811,10 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB38_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; 
GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -8851,10 +8851,10 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB38_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8891,10 +8891,10 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB38_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -8932,10 +8932,10 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB38_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -8975,11 +8975,11 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: 
buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB38_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -9021,11 +9021,11 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB38_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -9079,11 +9079,11 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB39_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: @@ -9120,11 +9120,11 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB39_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: @@ -9167,11 +9167,11 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB39_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: @@ -9206,10 +9206,10 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB39_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB39_1 ; 
GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: @@ -9243,11 +9243,11 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB39_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: @@ -9281,11 +9281,11 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB39_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: @@ -9320,11 +9320,11 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB39_1 +; GFX8-NEXT: 
s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: @@ -9362,11 +9362,11 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v7 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB39_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: @@ -9405,11 +9405,11 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v7 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB39_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fmin ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -9462,11 +9462,11 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 
exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB40_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9505,11 +9505,11 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB40_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9554,11 +9554,11 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB40_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: 
global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9594,10 +9594,10 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB40_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9632,11 +9632,11 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB40_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9671,11 +9671,11 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB40_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: 
s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9711,11 +9711,11 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB40_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9754,11 +9754,11 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB40_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9798,11 +9798,11 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB40_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 
.LBB40_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 @@ -9856,11 +9856,11 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB41_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -9900,11 +9900,11 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB41_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -9949,11 +9949,11 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB41_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -9989,10 +9989,10 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB41_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10027,11 +10027,11 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB41_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10066,11 +10066,11 @@ define void 
@global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB41_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10106,11 +10106,11 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB41_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10149,11 +10149,11 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB41_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: 
global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10193,11 +10193,11 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB41_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024 @@ -10240,11 +10240,11 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB42_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -10276,10 +10276,10 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB42_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX940-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -10314,11 +10314,11 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB42_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -10348,10 +10348,10 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB42_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -10381,10 +10381,10 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB42_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, 
s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10414,10 +10414,10 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB42_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -10448,10 +10448,10 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB42_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -10481,11 +10481,11 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB42_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -10517,11 
+10517,11 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB42_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -10564,11 +10564,11 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB43_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -10598,11 +10598,11 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB43_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -10635,11 +10635,11 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB43_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -10668,10 +10668,10 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB43_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -10699,11 +10699,11 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: 
s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB43_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -10731,11 +10731,11 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB43_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -10764,11 +10764,11 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB43_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -10798,11 +10798,11 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 ; GFX7-NEXT: 
s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB43_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -10833,11 +10833,11 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB43_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 @@ -10892,11 +10892,11 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB44_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -10937,10 +10937,10 @@ define bfloat 
@global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB44_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -10987,11 +10987,11 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB44_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -11028,10 +11028,10 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB44_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -11070,10 +11070,10 @@ define bfloat 
@global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11110,10 +11110,10 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB44_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -11151,10 +11151,10 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB44_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -11194,11 +11194,11 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 
s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB44_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -11240,11 +11240,11 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB44_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -11300,11 +11300,11 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB45_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11343,11 +11343,11 @@ define void 
@global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB45_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11392,11 +11392,11 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB45_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11432,10 +11432,10 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB45_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 
exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11472,11 +11472,11 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB45_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11511,11 +11511,11 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB45_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11551,11 +11551,11 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB45_1 +; GFX8-NEXT: s_cselect_b64 
exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11594,11 +11594,11 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB45_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11638,11 +11638,11 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB45_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 @@ -11679,11 +11679,11 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: 
s_cbranch_execnz .LBB46_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -11706,10 +11706,10 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB46_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -11733,11 +11733,11 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB46_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -11760,10 +11760,10 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: 
s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB46_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -11784,10 +11784,10 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB46_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11808,10 +11808,10 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB46_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -11836,10 +11836,10 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB46_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: 
s_cbranch_scc1 .LBB46_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -11867,27 +11867,27 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v2, v3 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v4 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX7-NEXT: v_mov_b32_e32 v7, v3 +; GFX7-NEXT: v_mov_b32_e32 v6, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB46_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -11913,31 +11913,31 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX6-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v2, v3 ; GFX6-NEXT: v_min_f32_e32 v6, v6, v4 ; GFX6-NEXT: v_min_f32_e32 v7, v7, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX6-NEXT: v_mov_b32_e32 v7, v3 +; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB46_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -11971,11 +11971,11 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; 
GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB47_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -11998,10 +11998,10 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB47_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -12025,11 +12025,11 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB47_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -12052,10 +12052,10 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX10-NEXT: 
buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB47_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -12076,10 +12076,10 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -12100,10 +12100,10 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB47_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -12130,10 +12130,10 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz 
.LBB47_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12160,27 +12160,27 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v2, v3 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v4 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX7-NEXT: v_mov_b32_e32 v7, v3 +; GFX7-NEXT: v_mov_b32_e32 v6, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB47_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 
v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -12206,31 +12206,31 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v2, v3 ; GFX6-NEXT: v_min_f32_e32 v6, v6, v4 ; GFX6-NEXT: v_min_f32_e32 v7, v7, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX6-NEXT: v_mov_b32_e32 v7, v3 +; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB47_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -12265,11 
+12265,11 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB48_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -12292,10 +12292,10 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB48_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -12319,11 +12319,11 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB48_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 
exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -12346,10 +12346,10 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB48_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -12370,10 +12370,10 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -12394,10 +12394,10 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB48_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -12424,10 +12424,10 @@ define <2 x half> 
@global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB48_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -12458,27 +12458,27 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_or_b32_e32 v0, v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v8 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB48_1 +; GFX7-NEXT: 
s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -12506,31 +12506,31 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_min_f32_e32 v7, v7, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_or_b32_e32 v0, v7, v0 +; GFX6-NEXT: v_mov_b32_e32 v7, v1 +; GFX6-NEXT: v_mov_b32_e32 v6, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v8 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB48_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], 
s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 @@ -12563,11 +12563,11 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB49_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -12588,11 +12588,11 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB49_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -12615,11 +12615,11 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB49_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -12641,10 +12641,10 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB49_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -12663,11 +12663,11 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -12686,11 +12686,11 @@ define void 
@global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB49_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -12713,11 +12713,11 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB49_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -12744,27 +12744,27 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: 
v_or_b32_e32 v4, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB49_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -12788,31 +12788,31 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: 
buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB49_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -12844,11 +12844,11 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB50_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12869,11 +12869,11 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX940-NEXT: buffer_inv sc1 
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB50_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12896,11 +12896,11 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB50_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12922,10 +12922,10 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB50_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: 
global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12944,11 +12944,11 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12967,11 +12967,11 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB50_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12996,11 +12996,11 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB50_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX8-NEXT: ; 
%bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13027,27 +13027,27 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB50_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13071,31 +13071,31 @@ define void 
@global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB50_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 @@ -13128,11 +13128,11 @@ define void 
@global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB51_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13153,11 +13153,11 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB51_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13180,11 +13180,11 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB51_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; 
GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13206,10 +13206,10 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB51_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13228,11 +13228,11 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13251,11 +13251,11 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; 
GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB51_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13280,11 +13280,11 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB51_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13315,27 +13315,27 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; 
GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB51_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13363,31 +13363,31 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; 
GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB51_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 @@ -13420,11 +13420,11 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB52_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -13447,10 +13447,10 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB52_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX940-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -13474,11 +13474,11 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB52_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -13501,10 +13501,10 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB52_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -13527,10 +13527,10 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, 
s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -13551,10 +13551,10 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB52_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -13581,10 +13581,10 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB52_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13611,27 +13611,27 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v2, v3 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v4 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; 
GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX7-NEXT: v_mov_b32_e32 v7, v3 +; GFX7-NEXT: v_mov_b32_e32 v6, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB52_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -13657,31 +13657,31 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v2, v3 ; GFX6-NEXT: v_min_f32_e32 v6, v6, v4 ; GFX6-NEXT: v_min_f32_e32 v7, v7, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; 
GFX6-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX6-NEXT: v_mov_b32_e32 v7, v3 +; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB52_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -13716,11 +13716,11 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB53_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13741,11 +13741,11 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: 
s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB53_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13768,11 +13768,11 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB53_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13794,10 +13794,10 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB53_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ 
-13818,11 +13818,11 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13841,11 +13841,11 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB53_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13870,11 +13870,11 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB53_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13901,27 +13901,27 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB53_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13945,31 +13945,31 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX6-NEXT: 
; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB53_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 @@ -14024,11 +14024,11 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB54_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -14067,10 +14067,10 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB54_1 +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -14114,12 +14114,12 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB54_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -14156,10 
+14156,10 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB54_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -14196,10 +14196,10 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -14236,10 +14236,10 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB54_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -14277,10 +14277,10 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: 
s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB54_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -14318,13 +14318,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB54_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -14364,13 +14364,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB54_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -14422,11 
+14422,11 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB55_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -14465,10 +14465,10 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB55_1 +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -14512,12 +14512,12 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB55_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: 
s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -14554,10 +14554,10 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB55_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -14594,10 +14594,10 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -14634,10 +14634,10 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB55_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -14677,10 +14677,10 @@ 
define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB55_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14717,13 +14717,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB55_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -14763,13 +14763,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB55_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; 
GFX6-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -14822,11 +14822,11 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB56_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -14865,10 +14865,10 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB56_1 +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -14912,12 +14912,12 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB56_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -14954,10 +14954,10 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB56_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -14994,10 +14994,10 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -15034,10 +15034,10 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB56_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; 
GFX908-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -15077,10 +15077,10 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB56_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -15121,13 +15121,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB56_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -15169,13 +15169,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_and_b32_e32 
v1, 0xffff0000, v6 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB56_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512 @@ -15225,11 +15225,11 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB57_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -15266,11 +15266,11 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB57_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ 
-15312,12 +15312,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB57_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -15353,10 +15353,10 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB57_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -15391,11 +15391,11 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX90A-NEXT: ; 
%bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -15430,11 +15430,11 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB57_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -15470,11 +15470,11 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB57_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -15484,25 +15484,25 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; 
GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 @@ -15511,13 +15511,13 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB57_1 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -15527,26 +15527,26 @@ define void 
@global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 @@ -15555,13 +15555,13 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB57_1 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX6-NEXT: s_andn2_b64 
s[10:11], exec, s[8:9] +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -15610,11 +15610,11 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB58_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15651,11 +15651,11 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB58_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15697,12 +15697,12 @@ define void 
@global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB58_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15738,10 +15738,10 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB58_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15776,11 +15776,11 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB58_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX90A-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15815,11 +15815,11 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB58_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15857,11 +15857,11 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB58_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15871,25 +15871,25 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: 
v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 @@ -15898,13 +15898,13 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB58_1 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 
@@ -15914,26 +15914,26 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 @@ -15942,13 +15942,13 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB58_1 +; 
GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 @@ -15998,11 +15998,11 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB59_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -16039,11 +16039,11 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB59_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -16085,12 +16085,12 @@ define void 
@global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB59_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -16126,10 +16126,10 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB59_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -16164,11 +16164,11 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB59_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX90A-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -16203,11 +16203,11 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB59_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -16245,11 +16245,11 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB59_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -16259,29 +16259,29 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_mul_f32_e32 
v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 @@ -16290,13 +16290,13 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB59_1 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 
.LBB59_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -16306,30 +16306,30 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16 ; GFX6-NEXT: 
v_mov_b32_e32 v7, v5 @@ -16338,13 +16338,13 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB59_1 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512 @@ -16395,11 +16395,11 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB60_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -16438,10 +16438,10 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB60_1 +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, 
s[2:3] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -16485,12 +16485,12 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB60_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -16527,10 +16527,10 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB60_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -16569,10 +16569,10 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB60_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] 
+; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -16609,10 +16609,10 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB60_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -16652,10 +16652,10 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB60_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16692,13 +16692,13 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; 
GFX7-NEXT: s_cbranch_execnz .LBB60_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -16738,13 +16738,13 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB60_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -16796,11 +16796,11 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB61_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16837,11 +16837,11 @@ define void 
@global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB61_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16883,12 +16883,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB61_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16924,10 +16924,10 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB61_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX10-NEXT: ; 
%bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16964,11 +16964,11 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB61_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -17003,11 +17003,11 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB61_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -17045,11 +17045,11 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: 
s_cbranch_execnz .LBB61_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -17059,25 +17059,25 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 @@ -17086,13 +17086,13 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX7-NEXT: s_waitcnt vmcnt(0) ; 
GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB61_1 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -17102,26 +17102,26 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 
v4, 16, v4 ; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 @@ -17130,13 +17130,13 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB61_1 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll index 79aa69771f84bd..1a9f82d686576f 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll @@ -36,11 +36,11 @@ define float @global_agent_atomic_fsub_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB0_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: 
s_cbranch_scc1 .LBB0_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -60,10 +60,10 @@ define float @global_agent_atomic_fsub_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB0_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -85,11 +85,11 @@ define float @global_agent_atomic_fsub_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB0_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -110,10 +110,10 @@ define float @global_agent_atomic_fsub_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB0_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: 
s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -132,10 +132,10 @@ define float @global_agent_atomic_fsub_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -154,10 +154,10 @@ define float @global_agent_atomic_fsub_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB0_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -176,10 +176,10 @@ define float @global_agent_atomic_fsub_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB0_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -204,10 +204,10 @@ define float @global_agent_atomic_fsub_ret_f32(ptr addrspace(1) %ptr, float 
%val ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB0_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -233,10 +233,10 @@ define float @global_agent_atomic_fsub_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB0_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -267,11 +267,11 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB1_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -291,10 +291,10 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: 
v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB1_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -316,11 +316,11 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB1_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -341,10 +341,10 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB1_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -363,10 +363,10 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -385,10 +385,10 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB1_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -409,10 +409,10 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB1_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_pos: @@ -436,10 +436,10 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB1_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], 
s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -465,10 +465,10 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB1_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -500,11 +500,11 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB2_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -524,10 +524,10 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB2_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 
.LBB2_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -549,11 +549,11 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB2_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -574,10 +574,10 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB2_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -596,10 +596,10 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 
exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -618,10 +618,10 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB2_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -642,10 +642,10 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB2_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_neg: @@ -675,10 +675,10 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB2_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_neg: @@ -709,10 +709,10 @@ define float 
@global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB2_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 @@ -742,11 +742,11 @@ define void @global_agent_atomic_fsub_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB3_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f32: @@ -764,11 +764,11 @@ define void @global_agent_atomic_fsub_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB3_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
global_agent_atomic_fsub_noret_f32: @@ -788,11 +788,11 @@ define void @global_agent_atomic_fsub_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB3_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fsub_noret_f32: @@ -812,10 +812,10 @@ define void @global_agent_atomic_fsub_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB3_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fsub_noret_f32: @@ -832,11 +832,11 @@ define void @global_agent_atomic_fsub_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 
s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fsub_noret_f32: @@ -853,11 +853,11 @@ define void @global_agent_atomic_fsub_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB3_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fsub_noret_f32: @@ -874,11 +874,11 @@ define void @global_agent_atomic_fsub_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB3_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fsub_noret_f32: @@ -901,11 +901,11 @@ define void @global_agent_atomic_fsub_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB3_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_noret_f32: @@ 
-929,11 +929,11 @@ define void @global_agent_atomic_fsub_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB3_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fsub ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst @@ -962,11 +962,11 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB4_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_pos: @@ -984,11 +984,11 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB4_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_pos: @@ -1008,11 +1008,11 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB4_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_pos: @@ -1032,10 +1032,10 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB4_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_pos: @@ -1052,11 +1052,11 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; 
GFX90A-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_pos: @@ -1073,11 +1073,11 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB4_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_pos: @@ -1096,11 +1096,11 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB4_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_pos: @@ -1123,11 +1123,11 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB4_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: 
s_cbranch_scc1 .LBB4_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_pos: @@ -1151,11 +1151,11 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB4_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 @@ -1185,11 +1185,11 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB5_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_neg: @@ -1207,11 +1207,11 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, 
exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB5_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_neg: @@ -1231,11 +1231,11 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB5_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_neg: @@ -1255,10 +1255,10 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB5_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_neg: @@ -1275,11 +1275,11 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] 
; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_neg: @@ -1296,11 +1296,11 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB5_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_neg: @@ -1319,11 +1319,11 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB5_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_neg: @@ -1350,11 +1350,11 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; 
GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB5_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_neg: @@ -1382,11 +1382,11 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB5_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 @@ -1417,11 +1417,11 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB6_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -1441,10 +1441,10 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; 
GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB6_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -1466,11 +1466,11 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB6_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -1491,10 +1491,10 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB6_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1515,10 +1515,10 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 
exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1537,10 +1537,10 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB6_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -1561,10 +1561,10 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB6_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_system_atomic_fsub_ret_f32__offset12b_pos: @@ -1588,10 +1588,10 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB6_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: 
s_cbranch_scc1 .LBB6_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1617,10 +1617,10 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB6_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -1651,11 +1651,11 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB7_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_noret_f32__offset12b_pos: @@ -1673,11 +1673,11 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB7_1 +; GFX940-NEXT: s_cselect_b64 
exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fsub_noret_f32__offset12b_pos: @@ -1697,11 +1697,11 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB7_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fsub_noret_f32__offset12b_pos: @@ -1721,10 +1721,10 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB7_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_system_atomic_fsub_noret_f32__offset12b_pos: @@ -1743,11 +1743,11 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, 
s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fsub_noret_f32__offset12b_pos: @@ -1764,11 +1764,11 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB7_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_system_atomic_fsub_noret_f32__offset12b_pos: @@ -1787,11 +1787,11 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB7_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_system_atomic_fsub_noret_f32__offset12b_pos: @@ -1814,11 +1814,11 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] 
-; GFX7-NEXT: s_cbranch_execnz .LBB7_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fsub_noret_f32__offset12b_pos: @@ -1842,11 +1842,11 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB7_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 @@ -1881,11 +1881,11 @@ define float @global_agent_atomic_fsub_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB8_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -1905,10 +1905,10 @@ define float @global_agent_atomic_fsub_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, 
s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB8_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -1930,11 +1930,11 @@ define float @global_agent_atomic_fsub_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -1955,10 +1955,10 @@ define float @global_agent_atomic_fsub_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB8_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1977,10 +1977,10 @@ define float @global_agent_atomic_fsub_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 +; GFX90A-NEXT: 
s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1999,10 +1999,10 @@ define float @global_agent_atomic_fsub_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB8_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -2021,10 +2021,10 @@ define float @global_agent_atomic_fsub_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB8_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -2049,10 +2049,10 @@ define float @global_agent_atomic_fsub_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; 
GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -2078,10 +2078,10 @@ define float @global_agent_atomic_fsub_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB8_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -2112,11 +2112,11 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB9_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2136,10 +2136,10 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB9_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 
v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -2161,11 +2161,11 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2186,10 +2186,10 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB9_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2208,10 +2208,10 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2230,10 +2230,10 
@@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB9_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -2254,10 +2254,10 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB9_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz: @@ -2281,10 +2281,10 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB9_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -2310,10 +2310,10 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 
s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB9_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -2345,11 +2345,11 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB10_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2369,10 +2369,10 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB10_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -2394,11 +2394,11 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB10_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2419,10 +2419,10 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB10_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2441,10 +2441,10 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2463,10 +2463,10 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; 
GFX908-NEXT: s_cbranch_execnz .LBB10_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -2487,10 +2487,10 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB10_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz: @@ -2520,10 +2520,10 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB10_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz: @@ -2554,10 +2554,10 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB10_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: 
s_cbranch_scc1 .LBB10_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 @@ -2587,11 +2587,11 @@ define void @global_agent_atomic_fsub_noret_f32__ftz(ptr addrspace(1) %ptr, floa ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB11_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f32__ftz: @@ -2609,11 +2609,11 @@ define void @global_agent_atomic_fsub_noret_f32__ftz(ptr addrspace(1) %ptr, floa ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB11_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fsub_noret_f32__ftz: @@ -2633,11 +2633,11 @@ define void @global_agent_atomic_fsub_noret_f32__ftz(ptr addrspace(1) %ptr, floa ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; 
GFX11-NEXT: s_cbranch_execnz .LBB11_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fsub_noret_f32__ftz: @@ -2657,10 +2657,10 @@ define void @global_agent_atomic_fsub_noret_f32__ftz(ptr addrspace(1) %ptr, floa ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB11_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fsub_noret_f32__ftz: @@ -2677,11 +2677,11 @@ define void @global_agent_atomic_fsub_noret_f32__ftz(ptr addrspace(1) %ptr, floa ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fsub_noret_f32__ftz: @@ -2698,11 +2698,11 @@ define void @global_agent_atomic_fsub_noret_f32__ftz(ptr addrspace(1) %ptr, floa ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; 
GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB11_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fsub_noret_f32__ftz: @@ -2719,11 +2719,11 @@ define void @global_agent_atomic_fsub_noret_f32__ftz(ptr addrspace(1) %ptr, floa ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB11_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fsub_noret_f32__ftz: @@ -2746,11 +2746,11 @@ define void @global_agent_atomic_fsub_noret_f32__ftz(ptr addrspace(1) %ptr, floa ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB11_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_noret_f32__ftz: @@ -2774,11 +2774,11 @@ define void @global_agent_atomic_fsub_noret_f32__ftz(ptr addrspace(1) %ptr, floa ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: 
s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB11_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fsub ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst @@ -2807,11 +2807,11 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB12_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -2829,11 +2829,11 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB12_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -2853,11 +2853,11 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 
v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB12_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -2877,10 +2877,10 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB12_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -2897,11 +2897,11 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -2918,11 +2918,11 @@ define void 
@global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB12_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -2941,11 +2941,11 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB12_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -2968,11 +2968,11 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB12_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -2996,11 +2996,11 @@ define void 
@global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB12_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 @@ -3030,11 +3030,11 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB13_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz: @@ -3052,11 +3052,11 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB13_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; 
GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz: @@ -3076,11 +3076,11 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB13_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz: @@ -3100,10 +3100,10 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB13_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz: @@ -3120,11 +3120,11 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 
.LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz: @@ -3141,11 +3141,11 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB13_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz: @@ -3164,11 +3164,11 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB13_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz: @@ -3195,11 +3195,11 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB13_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: 
s_cbranch_scc1 .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz: @@ -3227,11 +3227,11 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB13_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 @@ -3262,11 +3262,11 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB14_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -3286,10 +3286,10 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB14_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, 
s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -3311,11 +3311,11 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3336,10 +3336,10 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB14_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -3360,10 +3360,10 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; 
GFX90A-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3382,10 +3382,10 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB14_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -3406,10 +3406,10 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB14_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_system_atomic_fsub_ret_f32__offset12b_pos__ftz: @@ -3433,10 +3433,10 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB14_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: 
s_setpc_b64 s[30:31] ; @@ -3462,10 +3462,10 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB14_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -3496,11 +3496,11 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB15_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -3518,11 +3518,11 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB15_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, 
s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -3542,11 +3542,11 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -3566,10 +3566,10 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB15_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_system_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -3588,11 +3588,11 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: 
s_cbranch_scc1 .LBB15_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -3609,11 +3609,11 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB15_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_system_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -3632,11 +3632,11 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB15_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_system_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -3659,11 +3659,11 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB15_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; 
GFX7-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -3687,11 +3687,11 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB15_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 @@ -3726,11 +3726,11 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -3750,10 +3750,10 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz 
.LBB16_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: v_mov_b32_e32 v1, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -3776,11 +3776,11 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB16_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3802,10 +3802,10 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB16_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -3825,10 +3825,10 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: 
s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3849,10 +3849,10 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB16_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -3873,10 +3873,10 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB16_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -3909,10 +3909,10 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB16_1 
+; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_ret_f64: @@ -3944,10 +3944,10 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB16_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst @@ -3977,11 +3977,11 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -4001,10 +4001,10 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB17_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: v_mov_b32_e32 v1, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -4027,11 +4027,11 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4053,10 +4053,10 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB17_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4076,10 +4076,10 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, 
v[4:5], v[6:7] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -4100,10 +4100,10 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB17_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -4126,10 +4126,10 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB17_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fsub_ret_f64__offset12b_pos: @@ -4160,10 +4160,10 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: 
s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_ret_f64__offset12b_pos: @@ -4195,10 +4195,10 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB17_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 @@ -4229,11 +4229,11 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -4253,10 +4253,10 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, 
v[4:5], v[6:7] ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB18_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: v_mov_b32_e32 v1, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -4279,11 +4279,11 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4305,10 +4305,10 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB18_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4328,10 +4328,10 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: 
buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -4352,10 +4352,10 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -4378,10 +4378,10 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB18_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fsub_ret_f64__offset12b_neg: @@ -4416,10 +4416,10 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; 
GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB18_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_ret_f64__offset12b_neg: @@ -4455,10 +4455,10 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB18_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 @@ -4488,11 +4488,11 @@ define void @global_agent_atomic_fsub_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f64: @@ -4510,11 +4510,11 @@ define void @global_agent_atomic_fsub_noret_f64(ptr addrspace(1) %ptr, double %v 
; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB19_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fsub_noret_f64: @@ -4534,11 +4534,11 @@ define void @global_agent_atomic_fsub_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fsub_noret_f64: @@ -4559,10 +4559,10 @@ define void @global_agent_atomic_fsub_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB19_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fsub_noret_f64: @@ -4579,11 +4579,11 @@ define 
void @global_agent_atomic_fsub_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB19_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fsub_noret_f64: @@ -4601,11 +4601,11 @@ define void @global_agent_atomic_fsub_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB19_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fsub_noret_f64: @@ -4623,11 +4623,11 @@ define void @global_agent_atomic_fsub_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB19_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fsub_noret_f64: @@ 
-4653,11 +4653,11 @@ define void @global_agent_atomic_fsub_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] ; GFX7-NEXT: v_mov_b32_e32 v6, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v7, v9 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB19_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_noret_f64: @@ -4684,11 +4684,11 @@ define void @global_agent_atomic_fsub_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] ; GFX6-NEXT: v_mov_b32_e32 v6, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v7, v9 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB19_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fsub ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst @@ -4717,11 +4717,11 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB20_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB20_1 ; 
GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f64__offset12b_pos: @@ -4739,11 +4739,11 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB20_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fsub_noret_f64__offset12b_pos: @@ -4763,11 +4763,11 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB20_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fsub_noret_f64__offset12b_pos: @@ -4788,10 +4788,10 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB20_1 +; 
GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fsub_noret_f64__offset12b_pos: @@ -4808,11 +4808,11 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fsub_noret_f64__offset12b_pos: @@ -4830,11 +4830,11 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB20_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fsub_noret_f64__offset12b_pos: @@ -4854,11 +4854,11 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; 
GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB20_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fsub_noret_f64__offset12b_pos: @@ -4884,11 +4884,11 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] ; GFX7-NEXT: v_mov_b32_e32 v6, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v7, v9 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB20_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_noret_f64__offset12b_pos: @@ -4915,11 +4915,11 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] ; GFX6-NEXT: v_mov_b32_e32 v6, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v7, v9 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB20_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 @@ -4949,11 +4949,11 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_or_b32 s0, 
vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB21_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f64__offset12b_neg: @@ -4971,11 +4971,11 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB21_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fsub_noret_f64__offset12b_neg: @@ -4995,11 +4995,11 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB21_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX10-LABEL: global_agent_atomic_fsub_noret_f64__offset12b_neg: @@ -5020,10 +5020,10 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB21_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fsub_noret_f64__offset12b_neg: @@ -5040,11 +5040,11 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fsub_noret_f64__offset12b_neg: @@ -5062,11 +5062,11 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB21_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fsub_noret_f64__offset12b_neg: @@ -5086,11 +5086,11 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB21_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fsub_noret_f64__offset12b_neg: @@ -5120,11 +5120,11 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] ; GFX7-NEXT: v_mov_b32_e32 v6, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v7, v9 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB21_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_noret_f64__offset12b_neg: @@ -5155,11 +5155,11 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] ; GFX6-NEXT: v_mov_b32_e32 v6, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v7, v9 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB21_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; 
GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 @@ -5208,11 +5208,11 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB22_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -5221,32 +5221,32 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v0 ; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX940-NEXT: global_load_dword v4, v[0:1], off +; GFX940-NEXT: global_load_dword v5, v[0:1], off ; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX940-NEXT: v_sub_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_sub_f16_e32 v5, v5, v2 +; 
GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB22_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fsub_ret_f16: @@ -5281,11 +5281,11 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB22_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -5315,10 +5315,10 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB22_1 +; 
GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -5327,31 +5327,31 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v4, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX90A-NEXT: v_sub_f16_e32 v4, v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_sub_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, 
s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fsub_ret_f16: @@ -5359,31 +5359,31 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v0 ; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: global_load_dword v5, v[0:1], off ; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v4 -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX908-NEXT: v_sub_f16_e32 v4, v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX908-NEXT: v_mov_b32_e32 v6, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_sub_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB22_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; 
GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fsub_ret_f16: @@ -5412,10 +5412,10 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB22_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -5453,11 +5453,11 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB22_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -5496,11 +5496,11 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB22_1 +; 
GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -5547,11 +5547,11 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB23_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -5559,35 +5559,35 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: global_load_dword v4, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 ; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: 
.LBB23_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX940-NEXT: v_sub_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_sub_f16_e32 v5, v5, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB23_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos: @@ -5623,11 +5623,11 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB23_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX11-NEXT: ; 
%bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -5658,10 +5658,10 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB23_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -5671,31 +5671,31 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v4, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX90A-NEXT: v_sub_f16_e32 v4, v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_sub_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; 
GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos: @@ -5704,31 +5704,31 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: global_load_dword v5, v[0:1], off ; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v4 -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX908-NEXT: v_sub_f16_e32 v4, v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX908-NEXT: v_mov_b32_e32 v6, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_sub_f16_e32 
v5, v5, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB23_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos: @@ -5758,10 +5758,10 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB23_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -5800,11 +5800,11 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB23_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 
exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -5845,11 +5845,11 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB23_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -5897,11 +5897,11 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB24_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -5910,35 +5910,35 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_movk_i32 s0, 0xf800 ; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: global_load_dword v4, v[0:1], off -; GFX940-NEXT: 
v_and_b32_e32 v3, 3, v6 +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 ; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX940-NEXT: v_sub_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_sub_f16_e32 v5, v5, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB24_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_neg: @@ -5974,11 +5974,11 @@ define half 
@global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB24_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -6009,10 +6009,10 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB24_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -6022,31 +6022,31 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v4, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: 
.LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX90A-NEXT: v_sub_f16_e32 v4, v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_sub_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_neg: @@ -6055,31 +6055,31 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: global_load_dword v5, v[0:1], off ; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX908-NEXT: 
v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v4 -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX908-NEXT: v_sub_f16_e32 v4, v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX908-NEXT: v_mov_b32_e32 v6, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_sub_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB24_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_neg: @@ -6109,10 +6109,10 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB24_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, 
v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -6151,11 +6151,11 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB24_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -6196,11 +6196,11 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB24_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -6246,11 +6246,11 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB25_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX12-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f16: @@ -6278,11 +6278,11 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB25_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fsub_noret_f16: @@ -6316,11 +6316,11 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB25_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fsub_noret_f16: @@ -6349,10 +6349,10 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB25_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 
.LBB25_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fsub_noret_f16: @@ -6379,11 +6379,11 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB25_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fsub_noret_f16: @@ -6410,11 +6410,11 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB25_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fsub_noret_f16: @@ -6442,11 +6442,11 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB25_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX8-NEXT: ; %bb.2: 
; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fsub_noret_f16: @@ -6483,11 +6483,11 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v7 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB25_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_noret_f16: @@ -6525,11 +6525,11 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v7 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB25_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fsub ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst @@ -6573,11 +6573,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB26_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; 
GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_pos: @@ -6607,11 +6607,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB26_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_pos: @@ -6646,11 +6646,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_pos: @@ -6680,10 +6680,10 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; 
GFX10-NEXT: s_cbranch_execnz .LBB26_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_pos: @@ -6711,11 +6711,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_pos: @@ -6743,11 +6743,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB26_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_pos: @@ -6776,11 +6776,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, 
v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB26_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_pos: @@ -6818,11 +6818,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB26_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_pos: @@ -6861,11 +6861,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB26_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 @@ -6910,11 +6910,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 
exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB27_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_neg: @@ -6945,11 +6945,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB27_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_neg: @@ -6984,11 +6984,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_neg: @@ -7018,10 +7018,10 @@ define void 
@global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB27_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_neg: @@ -7049,11 +7049,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_neg: @@ -7081,11 +7081,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB27_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_neg: @@ -7114,11 +7114,11 @@ define void 
@global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB27_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_neg: @@ -7156,11 +7156,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB27_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_neg: @@ -7199,11 +7199,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB27_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024 @@ -7237,11 +7237,11 @@ define half 
@global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB28_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -7263,10 +7263,10 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB28_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -7291,11 +7291,11 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB28_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; 
GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -7318,10 +7318,10 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB28_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -7342,10 +7342,10 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7366,10 +7366,10 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB28_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -7392,10 +7392,10 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; 
GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB28_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos__align4: @@ -7424,11 +7424,11 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB28_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -7459,11 +7459,11 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB28_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -7497,11 +7497,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; 
GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB29_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f16__offset12b__align4_pos: @@ -7521,11 +7521,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB29_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fsub_noret_f16__offset12b__align4_pos: @@ -7548,11 +7548,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB29_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX10-LABEL: global_agent_atomic_fsub_noret_f16__offset12b__align4_pos: @@ -7574,10 +7574,10 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB29_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fsub_noret_f16__offset12b__align4_pos: @@ -7596,11 +7596,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fsub_noret_f16__offset12b__align4_pos: @@ -7619,11 +7619,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB29_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 
s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fsub_noret_f16__offset12b__align4_pos: @@ -7644,11 +7644,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB29_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fsub_noret_f16__offset12b__align4_pos: @@ -7677,11 +7677,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB29_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_noret_f16__offset12b__align4_pos: @@ -7711,11 +7711,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB29_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; 
GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 @@ -7761,11 +7761,11 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB30_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -7773,35 +7773,35 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: global_load_dword v4, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 ; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: v_lshrrev_b32_e32 v4, 
v3, v7 -; GFX940-NEXT: v_sub_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX940-NEXT: v_sub_f16_e32 v5, v5, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB30_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fsub_ret_f16__offset12b_pos: @@ -7837,11 +7837,11 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB30_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -7872,10 +7872,10 
@@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB30_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -7885,33 +7885,33 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v4, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX90A-NEXT: v_sub_f16_e32 v4, v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX90A-NEXT: v_sub_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; 
GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fsub_ret_f16__offset12b_pos: @@ -7920,31 +7920,31 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: global_load_dword v5, v[0:1], off ; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v4 -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX908-NEXT: v_sub_f16_e32 v4, v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX908-NEXT: v_mov_b32_e32 v6, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX908-NEXT: v_sub_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX908-NEXT: 
global_atomic_cmpswap v5, v[0:1], v[5:6], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB30_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_system_atomic_fsub_ret_f16__offset12b_pos: @@ -7974,10 +7974,10 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB30_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -8016,11 +8016,11 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB30_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: 
s_setpc_b64 s[30:31] @@ -8061,11 +8061,11 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB30_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -8112,11 +8112,11 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB31_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_noret_f16__offset12b_pos: @@ -8146,11 +8146,11 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB31_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fsub_noret_f16__offset12b_pos: @@ -8185,11 +8185,11 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB31_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fsub_noret_f16__offset12b_pos: @@ -8219,10 +8219,10 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB31_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_system_atomic_fsub_noret_f16__offset12b_pos: @@ -8252,11 +8252,11 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] 
+; GFX90A-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fsub_noret_f16__offset12b_pos: @@ -8284,11 +8284,11 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB31_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_system_atomic_fsub_noret_f16__offset12b_pos: @@ -8317,11 +8317,11 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB31_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_system_atomic_fsub_noret_f16__offset12b_pos: @@ -8359,11 +8359,11 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB31_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; 
GFX7-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fsub_noret_f16__offset12b_pos: @@ -8402,11 +8402,11 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB31_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 @@ -8463,11 +8463,11 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB32_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -8506,10 +8506,10 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB32_1 +; GFX940-NEXT: s_andn2_b64 
s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -8554,11 +8554,11 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB32_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -8594,10 +8594,10 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB32_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -8633,10 +8633,10 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: 
s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8672,10 +8672,10 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB32_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -8712,10 +8712,10 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB32_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -8753,11 +8753,11 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB32_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, 
exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -8796,11 +8796,11 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB32_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -8856,11 +8856,11 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB33_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -8901,10 +8901,10 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB33_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: 
s_cbranch_scc1 .LBB33_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -8951,11 +8951,11 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB33_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -8992,10 +8992,10 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB33_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -9032,10 +9032,10 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB33_1 ; 
GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9072,10 +9072,10 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB33_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -9113,10 +9113,10 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB33_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -9155,11 +9155,11 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB33_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: 
v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -9200,11 +9200,11 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB33_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB33_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -9261,11 +9261,11 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB34_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -9307,10 +9307,10 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB34_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: 
s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -9357,11 +9357,11 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB34_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -9398,10 +9398,10 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB34_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -9438,10 +9438,10 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; 
GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -9478,10 +9478,10 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB34_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -9519,10 +9519,10 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB34_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -9561,11 +9561,11 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB34_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -9606,11 +9606,11 @@ define 
bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB34_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB34_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -9664,11 +9664,11 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB35_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_bf16: @@ -9705,11 +9705,11 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB35_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fsub_noret_bf16: @@ -9752,11 +9752,11 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB35_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fsub_noret_bf16: @@ -9791,10 +9791,10 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB35_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fsub_noret_bf16: @@ -9828,11 +9828,11 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: 
s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fsub_noret_bf16: @@ -9866,11 +9866,11 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB35_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fsub_noret_bf16: @@ -9905,11 +9905,11 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB35_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fsub_noret_bf16: @@ -9946,11 +9946,11 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v7 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB35_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 
s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_noret_bf16: @@ -9988,11 +9988,11 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v7 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB35_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB35_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fsub ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst @@ -10045,11 +10045,11 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB36_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_pos: @@ -10088,11 +10088,11 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB36_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] 
+; GFX940-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_pos: @@ -10137,11 +10137,11 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB36_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_pos: @@ -10177,10 +10177,10 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB36_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_pos: @@ -10215,11 +10215,11 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; 
GFX90A-NEXT: s_cbranch_execnz .LBB36_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_pos: @@ -10254,11 +10254,11 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB36_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_pos: @@ -10294,11 +10294,11 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB36_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_pos: @@ -10336,11 +10336,11 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 -; GFX7-NEXT: s_andn2_b64 exec, exec, 
s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB36_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_pos: @@ -10379,11 +10379,11 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB36_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB36_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 @@ -10437,11 +10437,11 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB37_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_neg: @@ -10481,11 +10481,11 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, 
s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB37_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_neg: @@ -10530,11 +10530,11 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB37_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_neg: @@ -10570,10 +10570,10 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB37_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_neg: @@ -10608,11 +10608,11 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: 
buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB37_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_neg: @@ -10647,11 +10647,11 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB37_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_neg: @@ -10687,11 +10687,11 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB37_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_neg: @@ -10729,11 +10729,11 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) 
; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB37_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_neg: @@ -10772,11 +10772,11 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB37_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB37_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024 @@ -10819,11 +10819,11 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB38_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ 
-10855,10 +10855,10 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB38_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -10893,11 +10893,11 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB38_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -10927,10 +10927,10 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB38_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -10960,10 +10960,10 @@ define bfloat 
@global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB38_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -10993,10 +10993,10 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB38_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -11027,10 +11027,10 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB38_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4: @@ -11059,11 +11059,11 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; 
GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB38_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -11094,11 +11094,11 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB38_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB38_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -11141,11 +11141,11 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB39_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: @@ -11175,11 +11175,11 @@ define void 
@global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB39_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: @@ -11212,11 +11212,11 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB39_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: @@ -11245,10 +11245,10 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB39_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX90A-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: @@ -11276,11 +11276,11 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB39_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: @@ -11308,11 +11308,11 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB39_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: @@ -11341,11 +11341,11 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB39_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, 
s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: @@ -11374,11 +11374,11 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB39_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: @@ -11408,11 +11408,11 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB39_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB39_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 @@ -11467,11 +11467,11 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB40_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: 
s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -11512,10 +11512,10 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB40_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -11562,11 +11562,11 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB40_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -11603,10 +11603,10 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB40_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, 
s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -11645,10 +11645,10 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB40_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -11685,10 +11685,10 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB40_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -11726,10 +11726,10 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB40_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 
v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -11768,11 +11768,11 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB40_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -11813,11 +11813,11 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB40_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB40_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -11873,11 +11873,11 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB41_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX12-NEXT: ; 
%bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_noret_bf16__offset12b_pos: @@ -11916,11 +11916,11 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB41_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fsub_noret_bf16__offset12b_pos: @@ -11965,11 +11965,11 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB41_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fsub_noret_bf16__offset12b_pos: @@ -12005,10 +12005,10 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB41_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; 
GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_system_atomic_fsub_noret_bf16__offset12b_pos: @@ -12045,11 +12045,11 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB41_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fsub_noret_bf16__offset12b_pos: @@ -12084,11 +12084,11 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB41_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_system_atomic_fsub_noret_bf16__offset12b_pos: @@ -12124,11 +12124,11 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: 
s_cbranch_execnz .LBB41_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_system_atomic_fsub_noret_bf16__offset12b_pos: @@ -12166,11 +12166,11 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB41_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fsub_noret_bf16__offset12b_pos: @@ -12209,11 +12209,11 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB41_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB41_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 @@ -12248,11 +12248,11 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: 
s_cbranch_execnz .LBB42_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -12272,10 +12272,10 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB42_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -12297,11 +12297,11 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB42_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -12322,10 +12322,10 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 
exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB42_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -12344,10 +12344,10 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB42_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -12366,10 +12366,10 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB42_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -12390,10 +12390,10 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB42_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 
.LBB42_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -12421,27 +12421,27 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v2, v3 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v4 ; GFX7-NEXT: v_sub_f32_e32 v7, v7, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX7-NEXT: v_mov_b32_e32 v7, v3 +; GFX7-NEXT: v_mov_b32_e32 v6, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB42_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -12467,31 +12467,31 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX6-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v2, v3 ; GFX6-NEXT: v_sub_f32_e32 v6, v6, v4 ; GFX6-NEXT: v_sub_f32_e32 v7, v7, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX6-NEXT: v_mov_b32_e32 v7, v3 +; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB42_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB42_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -12523,11 +12523,11 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB43_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -12547,10 +12547,10 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB43_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -12572,11 +12572,11 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB43_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -12597,10 +12597,10 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB43_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -12619,10 +12619,10 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB43_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -12641,10 +12641,10 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB43_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -12667,10 +12667,10 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB43_1 +; GFX8-NEXT: s_andn2_b64 
s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fsub_ret_v2f16__offset12b_pos: @@ -12697,27 +12697,27 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v2, v3 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v4 ; GFX7-NEXT: v_sub_f32_e32 v7, v7, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX7-NEXT: v_mov_b32_e32 v7, v3 +; GFX7-NEXT: v_mov_b32_e32 v6, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB43_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 
s[30:31] @@ -12743,31 +12743,31 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v2, v3 ; GFX6-NEXT: v_sub_f32_e32 v6, v6, v4 ; GFX6-NEXT: v_sub_f32_e32 v7, v7, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX6-NEXT: v_mov_b32_e32 v7, v3 +; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB43_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB43_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -12800,11 +12800,11 @@ define <2 x half> 
@global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB44_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -12824,10 +12824,10 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB44_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -12849,11 +12849,11 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB44_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: 
v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -12874,10 +12874,10 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB44_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -12896,10 +12896,10 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -12918,10 +12918,10 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB44_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -12944,10 +12944,10 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa ; GFX8-NEXT: 
buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB44_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fsub_ret_v2f16__offset12b_neg: @@ -12978,27 +12978,27 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_or_b32_e32 v0, v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v8 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB44_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: 
s_cbranch_scc1 .LBB44_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_ret_v2f16__offset12b_neg: @@ -13026,31 +13026,31 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_sub_f32_e32 v7, v7, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_or_b32_e32 v0, v7, v0 +; GFX6-NEXT: v_mov_b32_e32 v7, v1 +; GFX6-NEXT: v_mov_b32_e32 v6, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v8 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB44_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; 
GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 @@ -13080,11 +13080,11 @@ define void @global_agent_atomic_fsub_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB45_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_v2f16: @@ -13102,11 +13102,11 @@ define void @global_agent_atomic_fsub_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB45_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fsub_noret_v2f16: @@ -13126,11 +13126,11 @@ define void @global_agent_atomic_fsub_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB45_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fsub_noret_v2f16: @@ -13150,10 +13150,10 @@ define void @global_agent_atomic_fsub_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB45_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fsub_noret_v2f16: @@ -13170,11 +13170,11 @@ define void @global_agent_atomic_fsub_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB45_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fsub_noret_v2f16: @@ -13191,11 +13191,11 @@ define void @global_agent_atomic_fsub_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: 
s_cbranch_execnz .LBB45_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fsub_noret_v2f16: @@ -13214,11 +13214,11 @@ define void @global_agent_atomic_fsub_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB45_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fsub_noret_v2f16: @@ -13245,27 +13245,27 @@ define void @global_agent_atomic_fsub_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: 
v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB45_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_noret_v2f16: @@ -13289,31 +13289,31 @@ define void @global_agent_atomic_fsub_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v6 ; 
GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB45_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB45_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst @@ -13342,11 +13342,11 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB46_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_v2f16__offset12b_pos: @@ -13364,11 +13364,11 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB46_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
global_agent_atomic_fsub_noret_v2f16__offset12b_pos: @@ -13388,11 +13388,11 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB46_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fsub_noret_v2f16__offset12b_pos: @@ -13412,10 +13412,10 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB46_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fsub_noret_v2f16__offset12b_pos: @@ -13432,11 +13432,11 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB46_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fsub_noret_v2f16__offset12b_pos: @@ -13453,11 +13453,11 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB46_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fsub_noret_v2f16__offset12b_pos: @@ -13478,11 +13478,11 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB46_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fsub_noret_v2f16__offset12b_pos: @@ -13509,27 +13509,27 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: 
v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB46_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_noret_v2f16__offset12b_pos: @@ -13553,31 +13553,31 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; 
GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB46_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 @@ -13607,11 +13607,11 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB47_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_v2f16__offset12b_neg: @@ -13629,11 +13629,11 @@ define void 
@global_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB47_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fsub_noret_v2f16__offset12b_neg: @@ -13653,11 +13653,11 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB47_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fsub_noret_v2f16__offset12b_neg: @@ -13677,10 +13677,10 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB47_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: 
global_agent_atomic_fsub_noret_v2f16__offset12b_neg: @@ -13697,11 +13697,11 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fsub_noret_v2f16__offset12b_neg: @@ -13718,11 +13718,11 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB47_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fsub_noret_v2f16__offset12b_neg: @@ -13743,11 +13743,11 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB47_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 
s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fsub_noret_v2f16__offset12b_neg: @@ -13778,27 +13778,27 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB47_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_noret_v2f16__offset12b_neg: @@ -13826,31 +13826,31 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: 
v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB47_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 @@ -13881,11 +13881,11 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB48_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -13905,10 +13905,10 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB48_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -13930,11 +13930,11 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB48_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -13955,10 +13955,10 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB48_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -13979,10 +13979,10 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -14001,10 +14001,10 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB48_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -14027,10 +14027,10 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB48_1 +; GFX8-NEXT: s_andn2_b64 
s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_system_atomic_fsub_ret_v2f16__offset12b_pos: @@ -14057,27 +14057,27 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v2, v3 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v4 ; GFX7-NEXT: v_sub_f32_e32 v7, v7, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX7-NEXT: v_mov_b32_e32 v7, v3 +; GFX7-NEXT: v_mov_b32_e32 v6, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB48_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 
s[30:31] @@ -14103,31 +14103,31 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v2, v3 ; GFX6-NEXT: v_sub_f32_e32 v6, v6, v4 ; GFX6-NEXT: v_sub_f32_e32 v7, v7, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX6-NEXT: v_mov_b32_e32 v7, v3 +; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB48_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB48_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -14159,11 +14159,11 @@ define void 
@global_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB49_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_noret_v2f16__offset12b_pos: @@ -14181,11 +14181,11 @@ define void @global_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB49_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fsub_noret_v2f16__offset12b_pos: @@ -14205,11 +14205,11 @@ define void @global_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB49_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: 
s_cbranch_scc1 .LBB49_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fsub_noret_v2f16__offset12b_pos: @@ -14229,10 +14229,10 @@ define void @global_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB49_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_system_atomic_fsub_noret_v2f16__offset12b_pos: @@ -14251,11 +14251,11 @@ define void @global_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fsub_noret_v2f16__offset12b_pos: @@ -14272,11 +14272,11 @@ define void @global_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB49_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: 
s_cbranch_scc1 .LBB49_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_system_atomic_fsub_noret_v2f16__offset12b_pos: @@ -14297,11 +14297,11 @@ define void @global_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB49_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_system_atomic_fsub_noret_v2f16__offset12b_pos: @@ -14328,27 +14328,27 @@ define void @global_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: 
v_lshrrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB49_1 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fsub_noret_v2f16__offset12b_pos: @@ -14372,31 +14372,31 @@ define void @global_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v4, v5 ; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v6 ; 
GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB49_1 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 @@ -14451,11 +14451,11 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB50_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -14494,10 +14494,10 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB50_1 +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -14541,12 +14541,12 @@ define <2 x bfloat> 
@global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB50_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -14583,10 +14583,10 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB50_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -14623,10 +14623,10 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -14663,10 +14663,10 @@ define <2 x bfloat> 
@global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB50_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -14704,10 +14704,10 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB50_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -14745,13 +14745,13 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB50_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -14791,13 +14791,13 @@ define <2 x bfloat> 
@global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB50_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -14849,11 +14849,11 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB51_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -14892,10 +14892,10 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB51_1 +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -14939,12 +14939,12 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB51_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -14981,10 +14981,10 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB51_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -15021,10 +15021,10 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -15061,10 +15061,10 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB51_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -15104,10 +15104,10 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB51_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_pos: @@ -15144,13 +15144,13 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB51_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -15190,13 +15190,13 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB51_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -15249,11 +15249,11 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB52_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -15292,10 +15292,10 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB52_1 
+; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -15339,12 +15339,12 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB52_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -15381,10 +15381,10 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB52_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -15421,10 +15421,10 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 +; 
GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -15461,10 +15461,10 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB52_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -15504,10 +15504,10 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB52_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_neg: @@ -15548,13 +15548,13 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, 
exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB52_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_neg: @@ -15596,13 +15596,13 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB52_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512 @@ -15652,11 +15652,11 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB53_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_v2bf16: @@ -15693,11 +15693,11 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr 
addrspace(1) %ptr, <2 x b ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB53_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fsub_noret_v2bf16: @@ -15739,12 +15739,12 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB53_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fsub_noret_v2bf16: @@ -15780,10 +15780,10 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB53_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: 
global_agent_atomic_fsub_noret_v2bf16: @@ -15818,11 +15818,11 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fsub_noret_v2bf16: @@ -15857,11 +15857,11 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB53_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fsub_noret_v2bf16: @@ -15897,11 +15897,11 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB53_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: 
global_agent_atomic_fsub_noret_v2bf16: @@ -15938,13 +15938,13 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB53_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_noret_v2bf16: @@ -15982,13 +15982,13 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB53_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst @@ -16037,11 +16037,11 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; 
GFX12-NEXT: s_cbranch_execnz .LBB54_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -16078,11 +16078,11 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB54_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -16124,12 +16124,12 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB54_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -16165,10 +16165,10 @@ define void 
@global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB54_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -16203,11 +16203,11 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -16242,11 +16242,11 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB54_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -16284,11 +16284,11 @@ define void 
@global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB54_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -16325,13 +16325,13 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB54_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -16369,13 +16369,13 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB54_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: 
s_cbranch_scc1 .LBB54_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 @@ -16425,11 +16425,11 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB55_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg: @@ -16466,11 +16466,11 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB55_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg: @@ -16512,12 +16512,12 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB55_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg: @@ -16553,10 +16553,10 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB55_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg: @@ -16591,11 +16591,11 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg: @@ -16630,11 +16630,11 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX908-NEXT: buffer_wbinvl1 ; 
GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB55_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg: @@ -16672,11 +16672,11 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB55_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg: @@ -16717,13 +16717,13 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB55_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg: @@ 
-16765,13 +16765,13 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB55_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512 @@ -16822,11 +16822,11 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB56_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -16865,10 +16865,10 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB56_1 +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; 
GFX940-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -16912,12 +16912,12 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB56_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -16954,10 +16954,10 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB56_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -16996,10 +16996,10 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; 
GFX90A-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -17036,10 +17036,10 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB56_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -17079,10 +17079,10 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB56_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_system_atomic_fsub_ret_v2bf16__offset12b_pos: @@ -17119,13 +17119,13 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB56_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], 
s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -17165,13 +17165,13 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB56_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -17223,11 +17223,11 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB57_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -17264,11 +17264,11 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: 
s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB57_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -17310,12 +17310,12 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB57_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -17351,10 +17351,10 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB57_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -17391,11 +17391,11 @@ define void 
@global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -17430,11 +17430,11 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB57_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -17472,11 +17472,11 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB57_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -17513,13 
+17513,13 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB57_1 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -17557,13 +17557,13 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX6-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB57_1 +; GFX6-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll index 6b4a6381d954cb..1888fac8add0c4 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll @@ -28,10 +28,10 @@ define amdgpu_ps float 
@global_max_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB0_1 +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_max_saddr_i32_rtn: @@ -52,10 +52,10 @@ define amdgpu_ps float @global_max_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB0_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: global_max_saddr_i32_rtn: @@ -79,11 +79,11 @@ define amdgpu_ps float @global_max_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB0_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -111,10 
+111,10 @@ define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB1_1 +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_max_saddr_i32_rtn_neg128: @@ -135,10 +135,10 @@ define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB1_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: global_max_saddr_i32_rtn_neg128: @@ -162,11 +162,11 @@ define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB1_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds 
i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -193,9 +193,10 @@ define amdgpu_ps void @global_max_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB2_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -216,8 +217,9 @@ define amdgpu_ps void @global_max_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB2_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; @@ -240,9 +242,10 @@ define amdgpu_ps void @global_max_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB2_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -268,9 +271,10 @@ define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; 
GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB3_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -291,8 +295,9 @@ define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB3_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; @@ -315,9 +320,10 @@ define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB3_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -348,10 +354,10 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB4_1 +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: ; return to shader part epilog @@ -376,10 +382,10 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB4_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: ; return to shader part epilog @@ -407,11 +413,11 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB4_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, v4 ; GFX11-NEXT: ; return to shader part epilog @@ -443,10 +449,10 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB5_1 +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; 
GFX9-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: ; return to shader part epilog @@ -471,10 +477,10 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB5_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: ; return to shader part epilog @@ -502,11 +508,11 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB5_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, v4 ; GFX11-NEXT: ; return to shader part epilog @@ -538,9 +544,10 @@ define amdgpu_ps void @global_max_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: 
s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB6_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -564,8 +571,9 @@ define amdgpu_ps void @global_max_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: v_mov_b32_e32 v5, v3 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB6_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; @@ -591,9 +599,10 @@ define amdgpu_ps void @global_max_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: v_mov_b32_e32 v5, v3 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB6_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -622,9 +631,10 @@ define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB7_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -648,8 +658,9 @@ define amdgpu_ps void 
@global_max_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: v_mov_b32_e32 v5, v3 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB7_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; @@ -675,9 +686,10 @@ define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: v_mov_b32_e32 v5, v3 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB7_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -710,10 +722,10 @@ define amdgpu_ps float @global_min_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB8_1 +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_min_saddr_i32_rtn: @@ -734,10 +746,10 @@ define amdgpu_ps float @global_min_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, 
s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB8_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: global_min_saddr_i32_rtn: @@ -761,11 +773,11 @@ define amdgpu_ps float @global_min_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -793,10 +805,10 @@ define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB9_1 +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_min_saddr_i32_rtn_neg128: @@ -817,10 +829,10 @@ define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX10-NEXT: s_or_b64 s[0:1], 
vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB9_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: global_min_saddr_i32_rtn_neg128: @@ -844,11 +856,11 @@ define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -875,9 +887,10 @@ define amdgpu_ps void @global_min_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB10_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -898,8 +911,9 @@ define amdgpu_ps void @global_min_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; 
GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB10_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; @@ -922,9 +936,10 @@ define amdgpu_ps void @global_min_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB10_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -950,9 +965,10 @@ define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB11_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -973,8 +989,9 @@ define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB11_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX10-NEXT: s_endpgm ; @@ -997,9 +1014,10 @@ define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB11_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -1030,10 +1048,10 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB12_1 +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: ; return to shader part epilog @@ -1058,10 +1076,10 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB12_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: ; return to shader part 
epilog @@ -1089,11 +1107,11 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB12_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, v4 ; GFX11-NEXT: ; return to shader part epilog @@ -1125,10 +1143,10 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB13_1 +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: ; return to shader part epilog @@ -1153,10 +1171,10 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB13_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-NEXT: 
v_mov_b32_e32 v0, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: ; return to shader part epilog @@ -1184,11 +1202,11 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB13_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, v4 ; GFX11-NEXT: ; return to shader part epilog @@ -1220,9 +1238,10 @@ define amdgpu_ps void @global_min_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB14_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -1246,8 +1265,9 @@ define amdgpu_ps void @global_min_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: v_mov_b32_e32 v5, v3 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB14_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; @@ -1273,9 +1293,10 @@ define amdgpu_ps void 
@global_min_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: v_mov_b32_e32 v5, v3 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -1304,9 +1325,10 @@ define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB15_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -1330,8 +1352,9 @@ define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: v_mov_b32_e32 v5, v3 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB15_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; @@ -1357,9 +1380,10 @@ define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: v_mov_b32_e32 v5, v3 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, 
s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -1392,10 +1416,10 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB16_1 +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_umax_saddr_i32_rtn: @@ -1416,10 +1440,10 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB16_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: global_umax_saddr_i32_rtn: @@ -1443,11 +1467,11 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB16_1 +; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1475,10 +1499,10 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(ptr addrspace(1) inreg ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB17_1 +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_umax_saddr_i32_rtn_neg128: @@ -1499,10 +1523,10 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(ptr addrspace(1) inreg ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB17_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: global_umax_saddr_i32_rtn_neg128: @@ -1526,11 +1550,11 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] 
-; GFX11-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1557,9 +1581,10 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn(ptr addrspace(1) inreg %sbase ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB18_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -1580,8 +1605,9 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn(ptr addrspace(1) inreg %sbase ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB18_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; @@ -1604,9 +1630,10 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn(ptr addrspace(1) inreg %sbase ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -1632,9 +1659,10 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB19_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -1655,8 +1683,9 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB19_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; @@ -1679,9 +1708,10 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset 
to i64 @@ -1712,10 +1742,10 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg % ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB20_1 +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: ; return to shader part epilog @@ -1740,10 +1770,10 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg % ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB20_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: ; return to shader part epilog @@ -1771,11 +1801,11 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg % ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB20_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: 
v_mov_b32_e32 v0, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, v4 ; GFX11-NEXT: ; return to shader part epilog @@ -1807,10 +1837,10 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB21_1 +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: ; return to shader part epilog @@ -1835,10 +1865,10 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB21_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: ; return to shader part epilog @@ -1866,11 +1896,11 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB21_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX11-NEXT: 
; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, v4 ; GFX11-NEXT: ; return to shader part epilog @@ -1902,9 +1932,10 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn(ptr addrspace(1) inreg %sbase ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB22_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -1928,8 +1959,9 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn(ptr addrspace(1) inreg %sbase ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: v_mov_b32_e32 v5, v3 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB22_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; @@ -1955,9 +1987,10 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn(ptr addrspace(1) inreg %sbase ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: v_mov_b32_e32 v5, v3 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB22_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -1986,9 +2019,10 @@ define amdgpu_ps void 
@global_umax_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB23_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -2012,8 +2046,9 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: v_mov_b32_e32 v5, v3 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB23_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; @@ -2039,9 +2074,10 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: v_mov_b32_e32 v5, v3 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB23_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -2074,10 +2110,10 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB24_1 +; 
GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_umin_saddr_i32_rtn: @@ -2098,10 +2134,10 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB24_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: global_umin_saddr_i32_rtn: @@ -2125,11 +2161,11 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB24_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2157,10 +2193,10 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(ptr addrspace(1) inreg ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, 
exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB25_1 +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_umin_saddr_i32_rtn_neg128: @@ -2181,10 +2217,10 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(ptr addrspace(1) inreg ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB25_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: global_umin_saddr_i32_rtn_neg128: @@ -2208,11 +2244,11 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB25_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2239,9 +2275,10 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn(ptr addrspace(1) inreg %sbase ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; 
GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB26_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -2262,8 +2299,9 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn(ptr addrspace(1) inreg %sbase ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB26_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; @@ -2286,9 +2324,10 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn(ptr addrspace(1) inreg %sbase ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -2314,9 +2353,10 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB27_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; 
GFX9-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -2337,8 +2377,9 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB27_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; @@ -2361,9 +2402,10 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -2394,10 +2436,10 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(ptr addrspace(1) inreg % ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB28_1 +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: ; return to shader part epilog @@ -2422,10 +2464,10 @@ define amdgpu_ps <2 x float> 
@global_umin_saddr_i64_rtn(ptr addrspace(1) inreg % ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB28_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: ; return to shader part epilog @@ -2453,11 +2495,11 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(ptr addrspace(1) inreg % ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB28_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, v4 ; GFX11-NEXT: ; return to shader part epilog @@ -2489,10 +2531,10 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB29_1 +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: 
; return to shader part epilog @@ -2517,10 +2559,10 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB29_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: ; return to shader part epilog @@ -2548,11 +2590,11 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB29_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, v4 ; GFX11-NEXT: ; return to shader part epilog @@ -2584,9 +2626,10 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn(ptr addrspace(1) inreg %sbase ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB30_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX9-NEXT: s_endpgm ; @@ -2610,8 +2653,9 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn(ptr addrspace(1) inreg %sbase ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: v_mov_b32_e32 v5, v3 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB30_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; @@ -2637,9 +2681,10 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn(ptr addrspace(1) inreg %sbase ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: v_mov_b32_e32 v5, v3 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB30_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -2668,9 +2713,10 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB31_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -2694,8 +2740,9 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: v_mov_b32_e32 v5, v3 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: 
s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB31_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; @@ -2721,9 +2768,10 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: v_mov_b32_e32 v5, v3 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB31_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll index c7fa2a2ede3887..d16417ea2da778 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll @@ -2182,11 +2182,11 @@ define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v4, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB51_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB51_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -2205,11 +2205,11 @@ define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: 
v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v3 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB51_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB51_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i32_noret: @@ -2227,11 +2227,11 @@ define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB51_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw nand ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -2260,11 +2260,11 @@ define void @global_atomic_nand_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v4, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB52_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB52_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -2285,11 +2285,11 @@ define void @global_atomic_nand_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; VI-NEXT: 
v_mov_b32_e32 v4, v3 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB52_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB52_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i32_noret_offset: @@ -2307,11 +2307,11 @@ define void @global_atomic_nand_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB52_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw nand ptr addrspace(1) %gep, i32 %in seq_cst @@ -2342,10 +2342,10 @@ define i32 @global_atomic_nand_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB53_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB53_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -2366,10 +2366,10 @@ define i32 @global_atomic_nand_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB53_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_cselect_b64 
exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB53_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2389,10 +2389,10 @@ define i32 @global_atomic_nand_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB53_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw nand ptr addrspace(1) %ptr, i32 %in seq_cst @@ -2423,10 +2423,10 @@ define i32 @global_atomic_nand_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB54_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB54_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -2449,10 +2449,10 @@ define i32 @global_atomic_nand_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB54_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB54_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: 
global_atomic_nand_i32_ret_offset: @@ -2471,10 +2471,10 @@ define i32 @global_atomic_nand_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB54_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -2510,11 +2510,11 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB55_1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB55_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v4, 1 ; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -2540,11 +2540,11 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB55_1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB55_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i32_noret_scalar: @@ -2563,11 +2563,11 @@ 
define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB55_1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw nand ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -2601,11 +2601,11 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB56_1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB56_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v4, 1 ; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -2633,11 +2633,11 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB56_1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB56_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i32_noret_offset_scalar: @@ -2656,11 +2656,11 @@ define amdgpu_gfx void 
@global_atomic_nand_i32_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB56_1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw nand ptr addrspace(1) %gep, i32 %in seq_cst @@ -2696,10 +2696,10 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB57_1 +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB57_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v3, 1 ; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -2728,10 +2728,10 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB57_1 +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB57_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i32_ret_scalar: @@ -2751,10 +2751,10 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) 
inreg ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB57_1 +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw nand ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -2789,10 +2789,10 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB58_1 +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB58_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v3, 1 ; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -2821,10 +2821,10 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB58_1 +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB58_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i32_ret_offset_scalar: @@ -2844,10 +2844,10 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[34:35], 
vcc, s[34:35] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB58_1 +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw nand ptr addrspace(1) %gep, i32 %in seq_cst @@ -2877,11 +2877,11 @@ define void @global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory(ptr ad ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v4, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB59_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB59_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -2902,11 +2902,11 @@ define void @global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory(ptr ad ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v3 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB59_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB59_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory: @@ -2924,11 +2924,11 @@ define void @global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory(ptr ad ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; 
GFX9-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB59_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw nand ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -2959,10 +2959,10 @@ define i32 @global_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory(ptr addrs ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB60_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB60_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -2985,10 +2985,10 @@ define i32 @global_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory(ptr addrs ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB60_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB60_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory: @@ -3007,10 +3007,10 @@ define i32 @global_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory(ptr addrs ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB60_1 +; GFX9-NEXT: 
s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 @@ -3964,11 +3964,11 @@ define void @global_atomic_max_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v4, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB83_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB83_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -3986,11 +3986,11 @@ define void @global_atomic_max_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v3 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB83_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB83_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i32_noret: @@ -4007,11 +4007,11 @@ define void @global_atomic_max_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB83_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB83_1 ; 
GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw max ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -4039,11 +4039,11 @@ define void @global_atomic_max_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v4, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB84_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB84_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -4063,11 +4063,11 @@ define void @global_atomic_max_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v3 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB84_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB84_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i32_noret_offset: @@ -4084,11 +4084,11 @@ define void @global_atomic_max_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB84_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB84_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = 
getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst @@ -4118,10 +4118,10 @@ define i32 @global_atomic_max_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB85_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB85_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -4141,10 +4141,10 @@ define i32 @global_atomic_max_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB85_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB85_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4163,10 +4163,10 @@ define i32 @global_atomic_max_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB85_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB85_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw max ptr addrspace(1) %ptr, i32 %in seq_cst @@ -4196,10 +4196,10 @@ define i32 @global_atomic_max_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { 
; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB86_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB86_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -4221,10 +4221,10 @@ define i32 @global_atomic_max_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB86_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB86_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i32_ret_offset: @@ -4242,10 +4242,10 @@ define i32 @global_atomic_max_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB86_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB86_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4280,11 +4280,11 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_scalar(ptr addrspace(1) inre ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] ; SI-NEXT: 
v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB87_1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB87_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v4, 1 ; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -4309,11 +4309,11 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB87_1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB87_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i32_noret_scalar: @@ -4331,11 +4331,11 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_scalar(ptr addrspace(1) inre ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB87_1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB87_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw max ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -4368,11 +4368,11 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_offset_scalar(ptr addrspace( ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] ; SI-NEXT: v_mov_b32_e32 v1, v2 -; 
SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB88_1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB88_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v4, 1 ; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -4399,11 +4399,11 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_offset_scalar(ptr addrspace( ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB88_1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB88_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i32_noret_offset_scalar: @@ -4421,11 +4421,11 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_offset_scalar(ptr addrspace( ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB88_1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB88_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst @@ -4460,10 +4460,10 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_scalar(ptr addrspace(1) inreg % ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: 
s_cbranch_execnz .LBB89_1 +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB89_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v3, 1 ; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -4491,10 +4491,10 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB89_1 +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB89_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i32_ret_scalar: @@ -4513,10 +4513,10 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB89_1 +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB89_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw max ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -4550,10 +4550,10 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB90_1 +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_cselect_b64 exec, s[38:39], 
s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB90_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v3, 1 ; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -4581,10 +4581,10 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB90_1 +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB90_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i32_ret_offset_scalar: @@ -4603,10 +4603,10 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB90_1 +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB90_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst @@ -4640,9 +4640,10 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[8:9], exec, s[0:1] ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB91_1 +; SI-NEXT: s_cselect_b64 exec, s[8:9], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB91_1 ; SI-NEXT: ; 
%bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; @@ -4671,9 +4672,10 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB91_1 +; VI-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; VI-NEXT: s_cbranch_scc1 .LBB91_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; @@ -4699,9 +4701,10 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB91_1 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB91_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: @@ -4739,11 +4742,11 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[0:1] ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB92_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB92_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -4778,10 +4781,10 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: 
s_cbranch_execnz .LBB92_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; VI-NEXT: s_cbranch_scc1 .LBB92_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -4811,10 +4814,10 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB92_1 +; GFX9-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX9-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB92_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_store_dword v1, v0, s[6:7] ; GFX9-NEXT: s_endpgm @@ -4853,9 +4856,10 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[8:9], exec, s[0:1] ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB93_1 +; SI-NEXT: s_cselect_b64 exec, s[8:9], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB93_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; @@ -4882,9 +4886,10 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB93_1 +; VI-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; VI-NEXT: s_cbranch_scc1 .LBB93_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; @@ -4910,9 
+4915,10 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB93_1 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB93_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: @@ -4949,11 +4955,11 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[0:1] ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB94_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB94_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -4986,10 +4992,10 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB94_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; VI-NEXT: s_cbranch_scc1 .LBB94_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -5019,10 +5025,10 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 
exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB94_1 +; GFX9-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX9-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB94_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_store_dword v1, v0, s[6:7] ; GFX9-NEXT: s_endpgm @@ -5055,11 +5061,11 @@ define void @global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory(ptr add ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v4, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB95_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB95_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -5079,11 +5085,11 @@ define void @global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory(ptr add ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v3 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB95_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB95_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory: @@ -5100,11 +5106,11 @@ define void @global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory(ptr add ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz 
.LBB95_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB95_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -5134,10 +5140,10 @@ define i32 @global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory(ptr addrsp ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB96_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB96_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -5159,10 +5165,10 @@ define i32 @global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory(ptr addrsp ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB96_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB96_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory: @@ -5180,10 +5186,10 @@ define i32 @global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory(ptr addrsp ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB96_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB96_1 
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 @@ -5217,11 +5223,11 @@ define void @global_atomic_umax_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v4, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB97_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB97_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -5239,11 +5245,11 @@ define void @global_atomic_umax_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v3 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB97_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB97_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i32_noret: @@ -5260,11 +5266,11 @@ define void @global_atomic_umax_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB97_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB97_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = 
atomicrmw umax ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -5292,11 +5298,11 @@ define void @global_atomic_umax_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v4, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB98_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB98_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -5316,11 +5322,11 @@ define void @global_atomic_umax_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v3 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB98_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB98_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i32_noret_offset: @@ -5337,11 +5343,11 @@ define void @global_atomic_umax_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB98_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB98_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst @@ -5371,10 
+5377,10 @@ define i32 @global_atomic_umax_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB99_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB99_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -5394,10 +5400,10 @@ define i32 @global_atomic_umax_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB99_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB99_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -5416,10 +5422,10 @@ define i32 @global_atomic_umax_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB99_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB99_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw umax ptr addrspace(1) %ptr, i32 %in seq_cst @@ -5449,10 +5455,10 @@ define i32 @global_atomic_umax_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; 
SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB100_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB100_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -5474,10 +5480,10 @@ define i32 @global_atomic_umax_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB100_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB100_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i32_ret_offset: @@ -5495,10 +5501,10 @@ define i32 @global_atomic_umax_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB100_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB100_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5533,11 +5539,11 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_scalar(ptr addrspace(1) inr ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB101_1 +; 
SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB101_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v4, 1 ; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -5562,11 +5568,11 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB101_1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB101_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i32_noret_scalar: @@ -5584,11 +5590,11 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB101_1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB101_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw umax ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -5621,11 +5627,11 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_offset_scalar(ptr addrspace ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB102_1 +; SI-NEXT: s_cselect_b64 
exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB102_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v4, 1 ; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -5652,11 +5658,11 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_offset_scalar(ptr addrspace ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB102_1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB102_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i32_noret_offset_scalar: @@ -5674,11 +5680,11 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB102_1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB102_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst @@ -5713,10 +5719,10 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_scalar(ptr addrspace(1) inreg ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB103_1 +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_cselect_b64 
exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB103_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v3, 1 ; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -5744,10 +5750,10 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB103_1 +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB103_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i32_ret_scalar: @@ -5766,10 +5772,10 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB103_1 +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB103_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw umax ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -5803,10 +5809,10 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB104_1 +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB104_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: 
s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v3, 1 ; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -5834,10 +5840,10 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB104_1 +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB104_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i32_ret_offset_scalar: @@ -5856,10 +5862,10 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB104_1 +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB104_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst @@ -5893,9 +5899,10 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[8:9], exec, s[0:1] ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB105_1 +; SI-NEXT: s_cselect_b64 exec, s[8:9], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB105_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; @@ -5924,9 +5931,10 @@ define 
amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB105_1 +; VI-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; VI-NEXT: s_cbranch_scc1 .LBB105_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; @@ -5952,9 +5960,10 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB105_1 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB105_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: @@ -5992,11 +6001,11 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[0:1] ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB106_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB106_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -6031,10 +6040,10 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB106_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; 
VI-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; VI-NEXT: s_cbranch_scc1 .LBB106_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -6064,10 +6073,10 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB106_1 +; GFX9-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX9-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB106_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_store_dword v1, v0, s[6:7] ; GFX9-NEXT: s_endpgm @@ -6107,11 +6116,11 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[0:1] ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB107_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB107_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -6144,10 +6153,10 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB107_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; VI-NEXT: s_cbranch_scc1 .LBB107_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, 
exec, s[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -6177,10 +6186,10 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB107_1 +; GFX9-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX9-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB107_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_store_dword v1, v0, s[6:7] ; GFX9-NEXT: s_endpgm @@ -6213,11 +6222,11 @@ define void @global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory(ptr ad ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v4, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB108_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB108_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -6237,11 +6246,11 @@ define void @global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory(ptr ad ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v3 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB108_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB108_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory: @@ -6258,11 
+6267,11 @@ define void @global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory(ptr ad ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB108_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB108_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -6292,10 +6301,10 @@ define i32 @global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory(ptr addrs ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB109_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB109_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -6317,10 +6326,10 @@ define i32 @global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory(ptr addrs ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB109_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB109_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory: @@ -6338,10 +6347,10 @@ define i32 
@global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory(ptr addrs ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB109_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB109_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 @@ -6375,11 +6384,11 @@ define void @global_atomic_umin_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v4, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB110_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB110_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -6397,11 +6406,11 @@ define void @global_atomic_umin_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v3 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB110_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB110_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i32_noret: @@ -6418,11 +6427,11 @@ define void @global_atomic_umin_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; 
GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB110_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB110_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw umin ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -6450,11 +6459,11 @@ define void @global_atomic_umin_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v4, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB111_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB111_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -6474,11 +6483,11 @@ define void @global_atomic_umin_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v3 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB111_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB111_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i32_noret_offset: @@ -6495,11 +6504,11 @@ define void @global_atomic_umin_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, 
v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB111_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB111_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst @@ -6529,10 +6538,10 @@ define i32 @global_atomic_umin_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB112_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB112_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -6552,10 +6561,10 @@ define i32 @global_atomic_umin_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB112_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB112_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -6574,10 +6583,10 @@ define i32 @global_atomic_umin_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB112_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB112_1 ; 
GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw umin ptr addrspace(1) %ptr, i32 %in seq_cst @@ -6607,10 +6616,10 @@ define i32 @global_atomic_umin_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB113_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB113_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -6632,10 +6641,10 @@ define i32 @global_atomic_umin_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB113_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB113_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i32_ret_offset: @@ -6653,10 +6662,10 @@ define i32 @global_atomic_umin_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB113_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB113_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = 
getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6691,11 +6700,11 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_scalar(ptr addrspace(1) inr ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB114_1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB114_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v4, 1 ; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -6720,11 +6729,11 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB114_1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB114_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i32_noret_scalar: @@ -6742,11 +6751,11 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB114_1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB114_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw umin ptr addrspace(1) %ptr, 
i32 %in seq_cst ret void @@ -6779,11 +6788,11 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_offset_scalar(ptr addrspace ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB115_1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB115_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v4, 1 ; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -6810,11 +6819,11 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_offset_scalar(ptr addrspace ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB115_1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB115_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i32_noret_offset_scalar: @@ -6832,11 +6841,11 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB115_1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB115_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = 
atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst @@ -6871,10 +6880,10 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_scalar(ptr addrspace(1) inreg ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB116_1 +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB116_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v3, 1 ; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -6902,10 +6911,10 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB116_1 +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB116_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i32_ret_scalar: @@ -6924,10 +6933,10 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB116_1 +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB116_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw umin ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -6961,10 +6970,10 @@ define amdgpu_gfx i32 
@global_atomic_umin_i32_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB117_1 +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB117_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v3, 1 ; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -6992,10 +7001,10 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB117_1 +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB117_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i32_ret_offset_scalar: @@ -7014,10 +7023,10 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB117_1 +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB117_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst @@ -7046,11 +7055,11 @@ define void @global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory(ptr ad ; 
SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v4, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB118_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB118_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -7070,11 +7079,11 @@ define void @global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory(ptr ad ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v3 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB118_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB118_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory: @@ -7091,11 +7100,11 @@ define void @global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory(ptr ad ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB118_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB118_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -7125,10 +7134,10 @@ define i32 @global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory(ptr addrs ; SI-NEXT: 
buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB119_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB119_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -7150,10 +7159,10 @@ define i32 @global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory(ptr addrs ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB119_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB119_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory: @@ -7171,10 +7180,10 @@ define i32 @global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory(ptr addrs ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB119_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB119_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 @@ -7208,11 +7217,11 @@ define void @global_atomic_min_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; 
SI-NEXT: v_mov_b32_e32 v4, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB120_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB120_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -7230,11 +7239,11 @@ define void @global_atomic_min_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v3 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB120_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB120_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i32_noret: @@ -7251,11 +7260,11 @@ define void @global_atomic_min_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB120_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB120_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw min ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -7283,11 +7292,11 @@ define void @global_atomic_min_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v4, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB121_1 +; SI-NEXT: s_cselect_b64 
exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB121_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -7307,11 +7316,11 @@ define void @global_atomic_min_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v3 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB121_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB121_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i32_noret_offset: @@ -7328,11 +7337,11 @@ define void @global_atomic_min_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB121_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB121_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst @@ -7362,10 +7371,10 @@ define i32 @global_atomic_min_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB122_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB122_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: 
s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -7385,10 +7394,10 @@ define i32 @global_atomic_min_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB122_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB122_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -7407,10 +7416,10 @@ define i32 @global_atomic_min_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB122_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB122_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw min ptr addrspace(1) %ptr, i32 %in seq_cst @@ -7440,10 +7449,10 @@ define i32 @global_atomic_min_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB123_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB123_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -7465,10 +7474,10 @@ define i32 @global_atomic_min_i32_ret_offset(ptr 
addrspace(1) %out, i32 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB123_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB123_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i32_ret_offset: @@ -7486,10 +7495,10 @@ define i32 @global_atomic_min_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB123_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB123_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7524,11 +7533,11 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_scalar(ptr addrspace(1) inre ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB124_1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB124_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v4, 1 ; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -7553,11 +7562,11 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; 
VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB124_1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB124_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i32_noret_scalar: @@ -7575,11 +7584,11 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_scalar(ptr addrspace(1) inre ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB124_1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB124_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw min ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -7612,11 +7621,11 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_offset_scalar(ptr addrspace( ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB125_1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB125_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v4, 1 ; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -7643,11 +7652,11 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_offset_scalar(ptr addrspace( ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 
s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB125_1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB125_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i32_noret_offset_scalar: @@ -7665,11 +7674,11 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_offset_scalar(ptr addrspace( ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB125_1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB125_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst @@ -7704,10 +7713,10 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_scalar(ptr addrspace(1) inreg % ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB126_1 +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB126_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v3, 1 ; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -7735,10 +7744,10 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 
s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB126_1 +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB126_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i32_ret_scalar: @@ -7757,10 +7766,10 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB126_1 +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB126_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw min ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -7794,10 +7803,10 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB127_1 +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB127_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v3, 1 ; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -7825,10 +7834,10 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB127_1 +; 
VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB127_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i32_ret_offset_scalar: @@ -7847,10 +7856,10 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB127_1 +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB127_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst @@ -7884,9 +7893,10 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[8:9], exec, s[0:1] ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB128_1 +; SI-NEXT: s_cselect_b64 exec, s[8:9], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB128_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; @@ -7915,9 +7925,10 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB128_1 +; VI-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; VI-NEXT: s_cbranch_scc1 .LBB128_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; 
VI-NEXT: s_endpgm ; @@ -7943,9 +7954,10 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB128_1 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB128_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: @@ -7983,11 +7995,11 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[0:1] ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB129_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB129_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -8022,10 +8034,10 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB129_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; VI-NEXT: s_cbranch_scc1 .LBB129_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -8055,10 +8067,10 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 
s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB129_1 +; GFX9-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX9-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB129_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_store_dword v1, v0, s[6:7] ; GFX9-NEXT: s_endpgm @@ -8093,9 +8105,10 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; SI-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB130_1 +; SI-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; SI-NEXT: s_cbranch_scc1 .LBB130_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; @@ -8118,9 +8131,10 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB130_1 +; VI-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; VI-NEXT: s_cbranch_scc1 .LBB130_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; @@ -8142,9 +8156,10 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB130_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB130_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: @@ -8180,11 
+8195,11 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[0:1] ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB131_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB131_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -8217,10 +8232,10 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB131_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; VI-NEXT: s_cbranch_scc1 .LBB131_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -8250,10 +8265,10 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB131_1 +; GFX9-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX9-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB131_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_store_dword v1, v0, s[6:7] ; GFX9-NEXT: s_endpgm @@ -8286,11 +8301,11 @@ define void @global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory(ptr add ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 
vcc, v5, v4 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v4, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB132_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB132_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -8310,11 +8325,11 @@ define void @global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory(ptr add ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v3 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB132_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB132_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory: @@ -8331,11 +8346,11 @@ define void @global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory(ptr add ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB132_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB132_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -8365,10 +8380,10 @@ define i32 @global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory(ptr addrsp ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; 
SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB133_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB133_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -8390,10 +8405,10 @@ define i32 @global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory(ptr addrsp ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB133_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB133_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory: @@ -8411,10 +8426,10 @@ define i32 @global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory(ptr addrsp ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB133_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB133_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll index 9af7e0978f9dbe..42334d4e15d977 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll @@ -2224,12 +2224,12 @@ define 
void @global_atomic_nand_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB50_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB50_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -2251,11 +2251,11 @@ define void @global_atomic_nand_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v6, v4 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB50_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB50_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i64_noret: @@ -2276,11 +2276,11 @@ define void @global_atomic_nand_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB50_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw nand ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -2313,12 +2313,12 @@ define void @global_atomic_nand_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: 
buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB51_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB51_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -2342,11 +2342,11 @@ define void @global_atomic_nand_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v6, v4 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB51_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB51_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i64_noret_offset: @@ -2367,11 +2367,11 @@ define void @global_atomic_nand_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB51_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw nand ptr addrspace(1) %gep, i64 %in seq_cst @@ -2411,10 +2411,10 @@ define i64 @global_atomic_nand_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: 
v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB52_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB52_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -2437,10 +2437,10 @@ define i64 @global_atomic_nand_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB52_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB52_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v4 ; VI-NEXT: v_mov_b32_e32 v1, v5 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -2464,10 +2464,10 @@ define i64 @global_atomic_nand_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB52_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2508,10 +2508,10 @@ define i64 @global_atomic_nand_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB53_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, 
s[8:9] +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB53_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -2536,10 +2536,10 @@ define i64 @global_atomic_nand_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB53_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB53_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i64_ret_offset: @@ -2561,10 +2561,10 @@ define i64 @global_atomic_nand_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB53_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2606,12 +2606,12 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inr ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] ; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v3, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB54_1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB54_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end 
-; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v8, 1 ; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -2642,11 +2642,11 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB54_1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB54_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i64_noret_scalar: @@ -2668,11 +2668,11 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB54_1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw nand ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -2711,12 +2711,12 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] ; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v3, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB55_1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB55_1 ; 
SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v8, 1 ; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -2747,11 +2747,11 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB55_1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB55_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i64_noret_offset_scalar: @@ -2773,11 +2773,11 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB55_1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw nand ptr addrspace(1) %gep, i64 %in seq_cst @@ -2819,10 +2819,10 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB56_1 +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: 
s_cbranch_scc1 .LBB56_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v6, 1 ; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -2854,10 +2854,10 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB56_1 +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB56_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i64_ret_scalar: @@ -2880,10 +2880,10 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB56_1 +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw nand ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -2924,10 +2924,10 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB57_1 +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB57_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, 
s[36:37] ; SI-NEXT: v_readlane_b32 s7, v6, 1 ; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -2959,10 +2959,10 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB57_1 +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB57_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i64_ret_offset_scalar: @@ -2985,10 +2985,10 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB57_1 +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw nand ptr addrspace(1) %gep, i64 %in seq_cst @@ -3022,12 +3022,12 @@ define void @global_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr ad ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB58_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB58_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, 
exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -3051,11 +3051,11 @@ define void @global_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr ad ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v6, v4 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB58_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB58_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory: @@ -3076,11 +3076,11 @@ define void @global_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr ad ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB58_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw nand ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -3120,10 +3120,10 @@ define i64 @global_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr addrs ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB59_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB59_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: 
s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -3148,10 +3148,10 @@ define i64 @global_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr addrs ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB59_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB59_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory: @@ -3173,10 +3173,10 @@ define i64 @global_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr addrs ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB59_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4091,12 +4091,12 @@ define void @global_atomic_max_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB80_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB80_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -4117,11 +4117,11 @@ define void 
@global_atomic_max_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v6, v4 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB80_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB80_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i64_noret: @@ -4141,11 +4141,11 @@ define void @global_atomic_max_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB80_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB80_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw max ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -4177,12 +4177,12 @@ define void @global_atomic_max_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB81_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB81_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -4205,11 +4205,11 @@ define void @global_atomic_max_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: 
v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v6, v4 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB81_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB81_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i64_noret_offset: @@ -4229,11 +4229,11 @@ define void @global_atomic_max_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB81_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB81_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw max ptr addrspace(1) %gep, i64 %in seq_cst @@ -4272,10 +4272,10 @@ define i64 @global_atomic_max_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB82_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB82_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -4297,10 +4297,10 @@ define i64 @global_atomic_max_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: s_or_b64 s[4:5], vcc, 
s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB82_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB82_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v4 ; VI-NEXT: v_mov_b32_e32 v1, v5 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -4323,10 +4323,10 @@ define i64 @global_atomic_max_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB82_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB82_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4366,10 +4366,10 @@ define i64 @global_atomic_max_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB83_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB83_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -4393,10 +4393,10 @@ define i64 @global_atomic_max_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB83_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: 
s_cbranch_scc1 .LBB83_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i64_ret_offset: @@ -4417,10 +4417,10 @@ define i64 @global_atomic_max_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB83_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB83_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4463,12 +4463,12 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB84_1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB84_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 ; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -4500,11 +4500,11 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB84_1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB84_1 ; 
VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i64_noret_scalar: @@ -4527,11 +4527,11 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB84_1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB84_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw max ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -4571,12 +4571,12 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace( ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB85_1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB85_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 ; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -4608,11 +4608,11 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace( ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB85_1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; 
VI-NEXT: s_cbranch_scc1 .LBB85_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i64_noret_offset_scalar: @@ -4635,11 +4635,11 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace( ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB85_1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB85_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw max ptr addrspace(1) %gep, i64 %in seq_cst @@ -4682,10 +4682,10 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg % ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB86_1 +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB86_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 ; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -4718,10 +4718,10 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB86_1 +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: 
s_cbranch_scc1 .LBB86_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i64_ret_scalar: @@ -4745,10 +4745,10 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB86_1 +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB86_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw max ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -4790,10 +4790,10 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB87_1 +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB87_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 ; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -4826,10 +4826,10 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB87_1 +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB87_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, 
s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i64_ret_offset_scalar: @@ -4853,10 +4853,10 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB87_1 +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB87_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw max ptr addrspace(1) %gep, i64 %in seq_cst @@ -4896,10 +4896,11 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[8:9], exec, s[0:1] ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB88_1 +; SI-NEXT: s_cselect_b64 exec, s[8:9], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB88_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; @@ -4933,9 +4934,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[0:1], exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB88_1 +; VI-NEXT: s_cselect_b64 exec, s[0:1], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB88_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; @@ -4966,9 +4968,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: 
v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB88_1 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB88_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: @@ -5010,12 +5013,12 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] ; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v3, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB89_1 +; SI-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB89_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -5053,10 +5056,10 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; VI-NEXT: s_cbranch_execnz .LBB89_1 +; VI-NEXT: s_andn2_b64 s[0:1], exec, s[8:9] +; VI-NEXT: s_cselect_b64 exec, s[0:1], s[8:9] +; VI-NEXT: s_cbranch_scc1 .LBB89_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[8:9] ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -5089,10 +5092,10 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: 
s_cbranch_execnz .LBB89_1 +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB89_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm @@ -5137,10 +5140,11 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[8:9], exec, s[0:1] ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB90_1 +; SI-NEXT: s_cselect_b64 exec, s[8:9], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB90_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; @@ -5172,9 +5176,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB90_1 +; VI-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; VI-NEXT: s_cbranch_scc1 .LBB90_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; @@ -5205,9 +5210,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB90_1 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB90_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: @@ -5248,12 +5254,12 @@ 
define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] ; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v3, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB91_1 +; SI-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB91_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -5289,10 +5295,10 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB91_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; VI-NEXT: s_cbranch_scc1 .LBB91_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -5325,10 +5331,10 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB91_1 +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB91_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm @@ -5365,12 +5371,12 @@ define void @global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr add ; 
SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB92_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB92_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -5393,11 +5399,11 @@ define void @global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr add ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v6, v4 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB92_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB92_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory: @@ -5417,11 +5423,11 @@ define void @global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr add ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB92_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB92_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw max ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -5460,10 +5466,10 @@ define i64 
@global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr addrsp ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB93_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB93_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -5487,10 +5493,10 @@ define i64 @global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr addrsp ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB93_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB93_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory: @@ -5511,10 +5517,10 @@ define i64 @global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr addrsp ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB93_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB93_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5553,12 +5559,12 @@ define void @global_atomic_umax_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] ; SI-NEXT: s_or_b64 s[8:9], vcc, 
s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB94_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB94_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -5579,11 +5585,11 @@ define void @global_atomic_umax_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v6, v4 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB94_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB94_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i64_noret: @@ -5603,11 +5609,11 @@ define void @global_atomic_umax_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB94_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB94_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw umax ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -5639,12 +5645,12 @@ define void @global_atomic_umax_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 
v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB95_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB95_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -5667,11 +5673,11 @@ define void @global_atomic_umax_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v6, v4 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB95_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB95_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i64_noret_offset: @@ -5691,11 +5697,11 @@ define void @global_atomic_umax_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB95_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB95_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i64 %in seq_cst @@ -5734,10 +5740,10 @@ define i64 @global_atomic_umax_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB96_1 +; SI-NEXT: 
s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB96_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -5759,10 +5765,10 @@ define i64 @global_atomic_umax_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB96_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB96_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v4 ; VI-NEXT: v_mov_b32_e32 v1, v5 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -5785,10 +5791,10 @@ define i64 @global_atomic_umax_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB96_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB96_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5828,10 +5834,10 @@ define i64 @global_atomic_umax_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB97_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB97_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; 
SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -5855,10 +5861,10 @@ define i64 @global_atomic_umax_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB97_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB97_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i64_ret_offset: @@ -5879,10 +5885,10 @@ define i64 @global_atomic_umax_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB97_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB97_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5925,12 +5931,12 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB98_1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB98_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 ; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -5962,11 
+5968,11 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB98_1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB98_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i64_noret_scalar: @@ -5989,11 +5995,11 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB98_1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB98_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw umax ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -6033,12 +6039,12 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB99_1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB99_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 ; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: 
s_xor_saveexec_b64 s[34:35], -1 @@ -6070,11 +6076,11 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB99_1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB99_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i64_noret_offset_scalar: @@ -6097,11 +6103,11 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB99_1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB99_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i64 %in seq_cst @@ -6144,10 +6150,10 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB100_1 +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB100_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 ; SI-NEXT: v_readlane_b32 
s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -6180,10 +6186,10 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB100_1 +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB100_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i64_ret_scalar: @@ -6207,10 +6213,10 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB100_1 +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB100_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw umax ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -6252,10 +6258,10 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB101_1 +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB101_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 ; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -6288,10 +6294,10 @@ define 
amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB101_1 +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB101_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i64_ret_offset_scalar: @@ -6315,10 +6321,10 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB101_1 +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB101_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw umax ptr addrspace(1) %gep, i64 %in seq_cst @@ -6358,10 +6364,11 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[8:9], exec, s[0:1] ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB102_1 +; SI-NEXT: s_cselect_b64 exec, s[8:9], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB102_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; @@ -6395,9 +6402,10 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: 
v_mov_b32_e32 v3, v1 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[0:1], exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB102_1 +; VI-NEXT: s_cselect_b64 exec, s[0:1], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB102_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; @@ -6428,9 +6436,10 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB102_1 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB102_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: @@ -6472,12 +6481,12 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] ; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v3, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB103_1 +; SI-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB103_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -6515,10 +6524,10 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; VI-NEXT: s_cbranch_execnz .LBB103_1 +; VI-NEXT: s_andn2_b64 s[0:1], exec, s[8:9] +; VI-NEXT: s_cselect_b64 exec, s[0:1], s[8:9] +; VI-NEXT: s_cbranch_scc1 
.LBB103_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[8:9] ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -6551,10 +6560,10 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB103_1 +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB103_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm @@ -6598,12 +6607,12 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] ; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v3, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB104_1 +; SI-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB104_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -6639,10 +6648,10 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB104_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; VI-NEXT: s_cbranch_scc1 .LBB104_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, 
s[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -6675,10 +6684,10 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB104_1 +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB104_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm @@ -6715,12 +6724,12 @@ define void @global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr ad ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB105_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB105_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -6743,11 +6752,11 @@ define void @global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr ad ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v6, v4 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB105_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB105_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: 
global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory: @@ -6767,11 +6776,11 @@ define void @global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr ad ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB105_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB105_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -6810,10 +6819,10 @@ define i64 @global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr addrs ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB106_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB106_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -6837,10 +6846,10 @@ define i64 @global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr addrs ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB106_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB106_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: 
global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory: @@ -6861,10 +6870,10 @@ define i64 @global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr addrs ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB106_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB106_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6903,12 +6912,12 @@ define void @global_atomic_umin_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB107_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB107_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -6929,11 +6938,11 @@ define void @global_atomic_umin_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v6, v4 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB107_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB107_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i64_noret: @@ -6953,11 +6962,11 @@ define void 
@global_atomic_umin_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB107_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB107_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw umin ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -6989,12 +6998,12 @@ define void @global_atomic_umin_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB108_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB108_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -7017,11 +7026,11 @@ define void @global_atomic_umin_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v6, v4 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB108_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB108_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i64_noret_offset: @@ -7041,11 +7050,11 @@ define void @global_atomic_umin_i64_noret_offset(ptr addrspace(1) %out, 
i64 %in) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB108_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB108_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw umin ptr addrspace(1) %gep, i64 %in seq_cst @@ -7084,10 +7093,10 @@ define i64 @global_atomic_umin_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB109_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB109_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -7109,10 +7118,10 @@ define i64 @global_atomic_umin_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB109_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB109_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v4 ; VI-NEXT: v_mov_b32_e32 v1, v5 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -7135,10 +7144,10 @@ define i64 @global_atomic_umin_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], 
vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB109_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB109_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -7178,10 +7187,10 @@ define i64 @global_atomic_umin_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB110_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB110_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -7205,10 +7214,10 @@ define i64 @global_atomic_umin_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB110_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB110_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i64_ret_offset: @@ -7229,10 +7238,10 @@ define i64 @global_atomic_umin_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB110_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; 
GFX9-NEXT: s_cbranch_scc1 .LBB110_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -7275,12 +7284,12 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB111_1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB111_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 ; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -7312,11 +7321,11 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB111_1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB111_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i64_noret_scalar: @@ -7339,11 +7348,11 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB111_1 +; GFX9-NEXT: 
s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB111_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw umin ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -7383,12 +7392,12 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB112_1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB112_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 ; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -7420,11 +7429,11 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB112_1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB112_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i64_noret_offset_scalar: @@ -7447,11 +7456,11 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; 
GFX9-NEXT: s_cbranch_execnz .LBB112_1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB112_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw umin ptr addrspace(1) %gep, i64 %in seq_cst @@ -7494,10 +7503,10 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB113_1 +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB113_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 ; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -7530,10 +7539,10 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB113_1 +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB113_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i64_ret_scalar: @@ -7557,10 +7566,10 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB113_1 +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: 
s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB113_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw umin ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -7602,10 +7611,10 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB114_1 +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB114_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 ; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -7638,10 +7647,10 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB114_1 +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB114_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i64_ret_offset_scalar: @@ -7665,10 +7674,10 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB114_1 +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB114_1 ; 
GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw umin ptr addrspace(1) %gep, i64 %in seq_cst @@ -7701,12 +7710,12 @@ define void @global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr ad ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB115_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB115_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -7729,11 +7738,11 @@ define void @global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr ad ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v6, v4 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB115_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB115_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory: @@ -7753,11 +7762,11 @@ define void @global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr ad ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB115_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: 
s_cbranch_scc1 .LBB115_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw umin ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -7796,10 +7805,10 @@ define i64 @global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr addrs ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB116_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB116_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -7823,10 +7832,10 @@ define i64 @global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr addrs ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB116_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB116_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory: @@ -7847,10 +7856,10 @@ define i64 @global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr addrs ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB116_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB116_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 
exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -7889,12 +7898,12 @@ define void @global_atomic_min_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB117_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB117_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -7915,11 +7924,11 @@ define void @global_atomic_min_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v6, v4 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB117_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB117_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i64_noret: @@ -7939,11 +7948,11 @@ define void @global_atomic_min_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB117_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB117_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw min ptr 
addrspace(1) %ptr, i64 %in seq_cst ret void @@ -7975,12 +7984,12 @@ define void @global_atomic_min_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB118_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB118_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -8003,11 +8012,11 @@ define void @global_atomic_min_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v6, v4 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB118_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB118_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i64_noret_offset: @@ -8027,11 +8036,11 @@ define void @global_atomic_min_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB118_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB118_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw min ptr 
addrspace(1) %gep, i64 %in seq_cst @@ -8070,10 +8079,10 @@ define i64 @global_atomic_min_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB119_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB119_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -8095,10 +8104,10 @@ define i64 @global_atomic_min_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB119_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB119_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v4 ; VI-NEXT: v_mov_b32_e32 v1, v5 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -8121,10 +8130,10 @@ define i64 @global_atomic_min_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB119_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB119_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -8164,10 +8173,10 @@ define i64 @global_atomic_min_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], 
v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB120_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB120_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -8191,10 +8200,10 @@ define i64 @global_atomic_min_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB120_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB120_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i64_ret_offset: @@ -8215,10 +8224,10 @@ define i64 @global_atomic_min_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB120_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB120_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -8261,12 +8270,12 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: s_andn2_b64 exec, exec, 
s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB121_1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB121_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 ; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -8298,11 +8307,11 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB121_1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB121_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i64_noret_scalar: @@ -8325,11 +8334,11 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB121_1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB121_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw min ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -8369,12 +8378,12 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace( ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 
-; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB122_1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB122_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 ; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -8406,11 +8415,11 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace( ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB122_1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB122_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i64_noret_offset_scalar: @@ -8433,11 +8442,11 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace( ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB122_1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB122_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw min ptr addrspace(1) %gep, i64 %in seq_cst @@ -8480,10 +8489,10 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg % ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: s_andn2_b64 
exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB123_1 +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB123_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 ; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -8516,10 +8525,10 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB123_1 +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB123_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i64_ret_scalar: @@ -8543,10 +8552,10 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB123_1 +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB123_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw min ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -8588,10 +8597,10 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB124_1 +; SI-NEXT: s_andn2_b64 
s[38:39], exec, s[36:37] +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB124_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 ; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -8624,10 +8633,10 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB124_1 +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB124_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i64_ret_offset_scalar: @@ -8651,10 +8660,10 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB124_1 +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB124_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw min ptr addrspace(1) %gep, i64 %in seq_cst @@ -8694,10 +8703,11 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[8:9], exec, s[0:1] ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; 
SI-NEXT: s_cbranch_execnz .LBB125_1 +; SI-NEXT: s_cselect_b64 exec, s[8:9], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB125_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; @@ -8731,9 +8741,10 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[0:1], exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB125_1 +; VI-NEXT: s_cselect_b64 exec, s[0:1], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB125_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; @@ -8764,9 +8775,10 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB125_1 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB125_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: @@ -8808,12 +8820,12 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] ; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v3, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB126_1 +; SI-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB126_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -8851,10 +8863,10 @@ define amdgpu_kernel void 
@atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; VI-NEXT: s_cbranch_execnz .LBB126_1 +; VI-NEXT: s_andn2_b64 s[0:1], exec, s[8:9] +; VI-NEXT: s_cselect_b64 exec, s[0:1], s[8:9] +; VI-NEXT: s_cbranch_scc1 .LBB126_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[8:9] ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -8887,10 +8899,10 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB126_1 +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB126_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm @@ -8933,10 +8945,11 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[0:1], exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB127_1 +; SI-NEXT: s_cselect_b64 exec, s[0:1], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB127_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; @@ -8964,9 +8977,10 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 
s[0:1], exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB127_1 +; VI-NEXT: s_cselect_b64 exec, s[0:1], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB127_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; @@ -8993,9 +9007,10 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB127_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB127_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: @@ -9035,12 +9050,12 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] ; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v3, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB128_1 +; SI-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB128_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -9076,10 +9091,10 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB128_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; VI-NEXT: s_cbranch_scc1 .LBB128_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] ; 
VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -9112,10 +9127,10 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB128_1 +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB128_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm @@ -9152,12 +9167,12 @@ define void @global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr add ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB129_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB129_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -9180,11 +9195,11 @@ define void @global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr add ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v6, v4 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB129_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB129_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: 
global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory: @@ -9204,11 +9219,11 @@ define void @global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr add ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB129_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB129_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw min ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -9247,10 +9262,10 @@ define i64 @global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr addrsp ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB130_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB130_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -9274,10 +9289,10 @@ define i64 @global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr addrsp ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB130_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB130_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: 
global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory: @@ -9298,10 +9313,10 @@ define i64 @global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr addrsp ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB130_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB130_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index 44cd2c6e3af675..2bcc643f9ea0b8 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -22,8 +22,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB0_3 +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[4:5] @@ -46,9 +47,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB0_2 +; GFX7LESS-NEXT: s_cselect_b64 
exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX7LESS-NEXT: .LBB0_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -58,8 +60,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB0_3 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -78,9 +81,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB0_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX9-NEXT: .LBB0_3: ; GFX9-NEXT: s_endpgm ; @@ -90,8 +94,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB0_3 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[4:5] @@ -111,8 +116,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; 
GFX1064-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX1064-NEXT: .LBB0_3: ; GFX1064-NEXT: s_endpgm ; @@ -122,8 +128,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: s_mov_b32 s4, 0 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB0_3 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s5 @@ -142,20 +149,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1032-NEXT: s_andn2_b32 s2, exec_lo, s4 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1032-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX1032-NEXT: .LBB0_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB0_2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; 
GFX1164-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -173,11 +183,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB0_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX1132-NEXT: s_bcnt1_i32_b32 s0, s0 @@ -197,8 +208,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX7LESS-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-DPP-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[4:5] @@ -221,9 +233,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, 
s[4:5] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX7LESS-DPP-NEXT: .LBB0_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -233,8 +246,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -253,9 +267,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX9-DPP-NEXT: .LBB0_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -265,8 +280,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s3, s[4:5] @@ -286,8 +302,9 @@ define amdgpu_kernel void 
@global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX1064-DPP-NEXT: .LBB0_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -297,8 +314,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s5 @@ -317,20 +335,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s2, exec_lo, s4 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX1032-DPP-NEXT: .LBB0_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; 
GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -348,11 +369,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_2 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, s0 @@ -415,9 +437,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execz .LBB1_5 +; GFX7LESS-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 
exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB1_5 ; GFX7LESS-NEXT: ; %bb.3: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 @@ -436,9 +459,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_4 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB1_4 ; GFX7LESS-NEXT: .LBB1_5: ; GFX7LESS-NEXT: s_endpgm ; @@ -485,9 +509,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB1_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB1_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 @@ -502,9 +527,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB1_4 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB1_4 ; GFX9-NEXT: .LBB1_5: ; GFX9-NEXT: s_endpgm ; @@ -551,9 +577,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 
v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB1_5 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB1_5 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 @@ -569,8 +596,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB1_4 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB1_4 ; GFX1064-NEXT: .LBB1_5: ; GFX1064-NEXT: s_endpgm ; @@ -617,9 +645,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB1_5 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB1_5 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 @@ -634,8 +663,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB1_4 
+; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB1_4 ; GFX1032-NEXT: .LBB1_5: ; GFX1032-NEXT: s_endpgm ; @@ -673,12 +703,13 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB1_4 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB1_4 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 @@ -721,11 +752,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB1_4 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB1_4 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 @@ -778,9 +810,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 
v3, v2 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB1_1 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -848,8 +881,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB1_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -864,9 +898,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX9-DPP-NEXT: .LBB1_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -920,18 +955,21 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; 
GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_add_f32_e64 v3, s2, s3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB1_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -947,8 +985,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX1064-DPP-NEXT: .LBB1_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -1001,14 +1040,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB1_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -1023,8 +1065,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX1032-DPP-NEXT: .LBB1_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -1074,21 +1117,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: 
v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_2 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -1141,15 +1187,18 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; 
GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_2 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -1175,8 +1224,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB2_3 +; GFX7LESS-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[0:1] ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 @@ -1203,9 +1253,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB2_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX7LESS-NEXT: .LBB2_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -1221,8 +1272,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB2_3 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] 
@@ -1245,9 +1297,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB2_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX9-NEXT: .LBB2_3: ; GFX9-NEXT: s_endpgm ; @@ -1263,8 +1316,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1064-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB2_3 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 @@ -1286,8 +1340,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1064-NEXT: .LBB2_3: ; GFX1064-NEXT: s_endpgm ; @@ -1303,8 +1358,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB2_3 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: 
s_cbranch_scc0 .LBB2_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s0 ; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 @@ -1325,8 +1381,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1032-NEXT: s_andn2_b32 s2, exec_lo, s4 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1032-NEXT: .LBB2_3: ; GFX1032-NEXT: s_endpgm ; @@ -1336,15 +1393,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-NEXT: v_mov_b32_e32 v1, s0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off ; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB2_2 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -1364,13 +1422,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off ; 
GFX1132-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB2_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -1395,8 +1454,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX7LESS-DPP-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX7LESS-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-DPP-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[0:1] ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 @@ -1423,9 +1483,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX7LESS-DPP-NEXT: .LBB2_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -1441,8 +1502,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX9-DPP-NEXT: 
s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1465,9 +1527,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX9-DPP-NEXT: .LBB2_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -1483,8 +1546,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 @@ -1506,8 +1570,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1064-DPP-NEXT: .LBB2_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -1523,8 +1588,9 @@ define amdgpu_kernel void 
@global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s0 ; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 @@ -1545,8 +1611,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s2, exec_lo, s4 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1032-DPP-NEXT: .LBB2_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -1556,15 +1623,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off ; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: 
s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -1584,13 +1652,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off ; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -1653,9 +1722,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execz .LBB3_5 +; GFX7LESS-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB3_5 ; GFX7LESS-NEXT: ; %bb.3: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 @@ -1674,9 +1744,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: 
v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_4 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB3_4 ; GFX7LESS-NEXT: .LBB3_5: ; GFX7LESS-NEXT: s_endpgm ; @@ -1723,9 +1794,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB3_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB3_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 @@ -1740,9 +1812,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB3_4 ; GFX9-NEXT: .LBB3_5: ; GFX9-NEXT: s_endpgm ; @@ -1789,9 +1862,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB3_5 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB3_5 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: 
s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 @@ -1807,8 +1881,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB3_4 ; GFX1064-NEXT: .LBB3_5: ; GFX1064-NEXT: s_endpgm ; @@ -1855,9 +1930,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB3_5 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB3_5 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 @@ -1872,8 +1948,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB3_4 ; GFX1032-NEXT: .LBB3_5: ; GFX1032-NEXT: s_endpgm ; @@ -1911,12 +1988,13 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: 
s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB3_4 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 @@ -1959,11 +2037,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB3_4 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 @@ -2016,9 +2095,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB3_1 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: 
s_endpgm ; @@ -2086,8 +2166,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -2102,9 +2183,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB3_2 ; GFX9-DPP-NEXT: .LBB3_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -2158,18 +2240,21 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_add_f32_e64 v3, s2, s3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 
v3 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -2185,8 +2270,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB3_2 ; GFX1064-DPP-NEXT: .LBB3_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -2239,14 +2325,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 
s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -2261,8 +2350,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB3_2 ; GFX1032-DPP-NEXT: .LBB3_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -2312,21 +2402,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; 
GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_2 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -2379,15 +2472,18 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_2 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -2413,8 +2509,9 @@ define amdgpu_kernel void 
@global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB4_3 +; GFX7LESS-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[0:1] ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 @@ -2441,9 +2538,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX7LESS-NEXT: .LBB4_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -2459,8 +2557,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB4_3 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -2483,9 +2582,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB4_2 +; 
GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX9-NEXT: .LBB4_3: ; GFX9-NEXT: s_endpgm ; @@ -2501,8 +2601,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1064-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB4_3 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 @@ -2524,8 +2625,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1064-NEXT: .LBB4_3: ; GFX1064-NEXT: s_endpgm ; @@ -2541,8 +2643,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB4_3 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s0 ; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 @@ -2563,8 +2666,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, 
exec_lo, s4 -; GFX1032-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1032-NEXT: s_andn2_b32 s2, exec_lo, s4 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1032-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1032-NEXT: .LBB4_3: ; GFX1032-NEXT: s_endpgm ; @@ -2574,15 +2678,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-NEXT: v_mov_b32_e32 v1, s0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off ; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB4_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -2605,9 +2710,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1164-NEXT: .LBB4_3: ; GFX1164-NEXT: s_endpgm ; @@ -2618,13 +2724,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; 
GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off ; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB4_3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -2645,9 +2752,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX1132-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s2, exec_lo, s4 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1132-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1132-NEXT: .LBB4_3: ; GFX1132-NEXT: s_endpgm ; @@ -2663,8 +2771,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX7LESS-DPP-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX7LESS-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-DPP-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[0:1] ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 @@ -2691,9 +2800,10 @@ define amdgpu_kernel 
void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX7LESS-DPP-NEXT: .LBB4_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -2709,8 +2819,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -2733,9 +2844,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX9-DPP-NEXT: .LBB4_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -2751,8 +2863,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-DPP-NEXT: 
s_cbranch_execz .LBB4_3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 @@ -2774,8 +2887,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1064-DPP-NEXT: .LBB4_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -2791,8 +2905,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s0 ; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 @@ -2813,8 +2928,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s2, exec_lo, s4 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1032-DPP-NEXT: .LBB4_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -2824,15 +2940,16 @@ define amdgpu_kernel 
void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off ; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -2855,9 +2972,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1164-DPP-NEXT: .LBB4_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -2868,13 +2986,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_clause 
0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off ; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -2895,9 +3014,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s2, exec_lo, s4 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1132-DPP-NEXT: .LBB4_3: ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic @@ -2951,9 +3071,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execz .LBB5_5 +; GFX7LESS-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB5_5 ; GFX7LESS-NEXT: ; %bb.3: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], 
s[34:35], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 @@ -2972,9 +3093,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_4 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX7LESS-NEXT: .LBB5_5: ; GFX7LESS-NEXT: s_endpgm ; @@ -3021,9 +3143,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB5_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB5_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 @@ -3038,9 +3161,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB5_4 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX9-NEXT: .LBB5_5: ; GFX9-NEXT: s_endpgm ; @@ -3087,9 +3211,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, 
v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB5_5 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB5_5 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 @@ -3105,8 +3230,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1064-NEXT: .LBB5_5: ; GFX1064-NEXT: s_endpgm ; @@ -3153,9 +3279,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB5_5 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB5_5 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 @@ -3170,8 +3297,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; 
GFX1032-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1032-NEXT: .LBB5_5: ; GFX1032-NEXT: s_endpgm ; @@ -3209,12 +3337,13 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB5_4 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB5_4 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 @@ -3257,11 +3386,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB5_4 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB5_4 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 @@ -3314,9 +3444,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: s_andn2_b64 
s[2:3], exec, s[0:1] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB5_1 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -3384,8 +3515,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB5_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -3400,9 +3532,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB5_2 ; GFX9-DPP-NEXT: .LBB5_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -3456,18 +3589,21 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; 
GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_add_f32_e64 v3, s2, s3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB5_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -3483,8 +3619,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB5_2 ; GFX1064-DPP-NEXT: .LBB5_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -3537,14 +3674,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; 
GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB5_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -3559,8 +3699,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB5_2 ; GFX1032-DPP-NEXT: .LBB5_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -3610,21 +3751,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_2 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -3677,15 +3821,18 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132-DPP-NEXT: s_cbranch_execz 
.LBB5_2 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -3745,9 +3892,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execz .LBB6_5 +; GFX7LESS-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB6_5 ; GFX7LESS-NEXT: ; %bb.3: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 @@ -3766,9 +3914,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_4 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB6_4 ; GFX7LESS-NEXT: .LBB6_5: ; GFX7LESS-NEXT: s_endpgm ; @@ -3815,9 +3964,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB6_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; 
GFX9-NEXT: s_cbranch_scc0 .LBB6_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 @@ -3832,9 +3982,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB6_4 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB6_4 ; GFX9-NEXT: .LBB6_5: ; GFX9-NEXT: s_endpgm ; @@ -3881,9 +4032,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB6_5 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB6_5 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 @@ -3899,8 +4051,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB6_4 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB6_4 ; GFX1064-NEXT: .LBB6_5: ; GFX1064-NEXT: s_endpgm ; @@ -3947,9 +4100,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; 
GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB6_5 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB6_5 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 @@ -3964,8 +4118,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB6_4 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB6_4 ; GFX1032-NEXT: .LBB6_5: ; GFX1032-NEXT: s_endpgm ; @@ -4003,12 +4158,13 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB6_4 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB6_4 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 @@ -4051,11 +4207,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_cbranch_scc1 .LBB6_1 
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB6_4 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB6_4 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 @@ -4108,9 +4265,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB6_1 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -4178,8 +4336,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -4194,9 +4353,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; 
GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX9-DPP-NEXT: .LBB6_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -4250,18 +4410,21 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_add_f32_e64 v3, s2, s3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -4277,8 +4440,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; 
GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1064-DPP-NEXT: .LBB6_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -4331,14 +4495,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -4353,8 +4520,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1032-DPP-NEXT: .LBB6_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -4404,21 +4572,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_2 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -4471,15 +4642,18 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, 
v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_2 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -4505,8 +4679,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB7_3 +; GFX7LESS-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[0:1] ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 @@ -4533,9 +4708,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: 
s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX7LESS-NEXT: .LBB7_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -4551,8 +4727,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB7_3 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -4575,9 +4752,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB7_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX9-NEXT: .LBB7_3: ; GFX9-NEXT: s_endpgm ; @@ -4593,8 +4771,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1064-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB7_3 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 @@ -4616,8 +4795,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1064-NEXT: 
v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX1064-NEXT: .LBB7_3: ; GFX1064-NEXT: s_endpgm ; @@ -4633,8 +4813,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB7_3 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s0 ; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 @@ -4655,8 +4836,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1032-NEXT: s_andn2_b32 s2, exec_lo, s4 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1032-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX1032-NEXT: .LBB7_3: ; GFX1032-NEXT: s_endpgm ; @@ -4666,15 +4848,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-NEXT: v_mov_b32_e32 v1, s0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off ; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; 
GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB7_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -4697,9 +4880,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX1164-NEXT: .LBB7_3: ; GFX1164-NEXT: s_endpgm ; @@ -4710,13 +4894,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off ; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB7_3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -4737,9 +4922,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1132-NEXT: v_cmp_eq_u32_e32 
vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX1132-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s2, exec_lo, s4 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1132-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX1132-NEXT: .LBB7_3: ; GFX1132-NEXT: s_endpgm ; @@ -4755,8 +4941,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX7LESS-DPP-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX7LESS-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-DPP-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[0:1] ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 @@ -4783,9 +4970,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX7LESS-DPP-NEXT: .LBB7_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -4801,8 +4989,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; 
GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -4825,9 +5014,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX9-DPP-NEXT: .LBB7_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -4843,8 +5033,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 @@ -4866,8 +5057,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX1064-DPP-NEXT: .LBB7_3: ; 
GFX1064-DPP-NEXT: s_endpgm ; @@ -4883,8 +5075,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s0 ; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 @@ -4905,8 +5098,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s2, exec_lo, s4 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX1032-DPP-NEXT: .LBB7_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -4916,15 +5110,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off ; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: 
s_cbranch_scc0 .LBB7_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -4947,9 +5142,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX1164-DPP-NEXT: .LBB7_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -4960,13 +5156,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off ; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -4987,9 +5184,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-DPP-NEXT: s_or_b32 
s4, vcc_lo, s4 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s2, exec_lo, s4 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX1132-DPP-NEXT: .LBB7_3: ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 monotonic, align 4 @@ -5042,9 +5240,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execz .LBB8_5 +; GFX7LESS-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB8_5 ; GFX7LESS-NEXT: ; %bb.3: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 @@ -5063,9 +5262,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB8_4 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX7LESS-NEXT: .LBB8_5: ; GFX7LESS-NEXT: s_endpgm ; @@ -5112,9 +5312,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB8_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB8_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 @@ -5129,9 +5330,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB8_4 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX9-NEXT: .LBB8_5: ; GFX9-NEXT: s_endpgm ; @@ -5178,9 +5380,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB8_5 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB8_5 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 @@ -5196,8 +5399,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB8_4 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: 
s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1064-NEXT: .LBB8_5: ; GFX1064-NEXT: s_endpgm ; @@ -5244,9 +5448,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB8_5 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB8_5 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 @@ -5261,8 +5466,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB8_4 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1032-NEXT: .LBB8_5: ; GFX1032-NEXT: s_endpgm ; @@ -5300,12 +5506,13 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB8_5 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; 
GFX1164-NEXT: s_cbranch_scc0 .LBB8_5 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 @@ -5321,9 +5528,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB8_4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1164-NEXT: .LBB8_5: ; GFX1164-NEXT: s_endpgm ; @@ -5362,11 +5570,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB8_5 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB8_5 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 @@ -5381,9 +5590,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB8_4 +; GFX1132-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1132-NEXT: .LBB8_5: ; GFX1132-NEXT: s_endpgm ; @@ -5431,9 +5641,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -5501,8 +5712,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -5517,9 +5729,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX9-DPP-NEXT: .LBB8_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ 
-5573,18 +5786,21 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_add_f32_e64 v3, s2, s3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -5600,8 +5816,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX1064-DPP-NEXT: .LBB8_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -5654,14 +5871,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: 
v_add_f32_e32 v3, v3, v5 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -5676,8 +5896,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX1032-DPP-NEXT: .LBB8_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -5727,21 +5948,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -5757,9 +5981,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB8_2 ; 
GFX1164-DPP-NEXT: .LBB8_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -5807,16 +6032,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -5831,9 +6059,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX1132-DPP-NEXT: .LBB8_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call float @div.float.value() strictfp @@ -5857,8 +6086,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v3, s0, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, s1, v3 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB9_3 +; GFX7LESS-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s33, s8 ; GFX7LESS-NEXT: s_mov_b32 s40, s7 @@ -5912,13 +6142,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s43 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_2 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX7LESS-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX7LESS-NEXT: .LBB9_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -5934,14 +6165,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX9-NEXT: s_add_u32 s48, s48, s9 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, s1, v3 ; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] ; 
GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB9_3 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 @@ -5952,9 +6183,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX9-NEXT: s_mov_b32 s40, s7 ; GFX9-NEXT: s_mov_b32 s41, s6 ; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-NEXT: .LBB9_2: ; %atomicrmw.start @@ -5994,8 +6226,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-NEXT: s_cbranch_execnz .LBB9_2 +; GFX9-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX9-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX9-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX9-NEXT: .LBB9_3: ; GFX9-NEXT: s_endpgm ; @@ -6010,13 +6243,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1064-NEXT: s_mov_b64 s[8:9], exec ; GFX1064-NEXT: s_addc_u32 s49, s49, 0 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, s9, v3 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], 
vcc -; GFX1064-NEXT: s_cbranch_execz .LBB9_3 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[8:9] ; GFX1064-NEXT: s_mov_b32 s40, s7 @@ -6072,8 +6306,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1064-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX1064-NEXT: .LBB9_3: ; GFX1064-NEXT: s_endpgm ; @@ -6089,12 +6324,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-NEXT: s_add_u32 s48, s48, s9 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1032-NEXT: s_mov_b32 s44, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB9_3 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s8 ; GFX1032-NEXT: s_mov_b32 s40, s7 @@ -6149,8 +6385,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1032-NEXT: s_andn2_b32 s0, 
exec_lo, s44 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX1032-NEXT: .LBB9_3: ; GFX1032-NEXT: s_endpgm ; @@ -6160,14 +6397,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-NEXT: s_mov_b64 s[8:9], exec ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1164-NEXT: s_mov_b32 s32, 32 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB9_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[8:9] ; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 @@ -6220,8 +6458,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX1164-NEXT: .LBB9_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm @@ -6231,13 +6471,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_mov_b32 s6, exec_lo ; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; 
GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1132-NEXT: s_mov_b32 s44, 0 ; GFX1132-NEXT: s_mov_b32 s32, 32 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB9_3 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1132-NEXT: s_bcnt1_i32_b32 s0, s6 ; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1132-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 @@ -6285,8 +6527,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1132-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX1132-NEXT: .LBB9_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm @@ -6306,8 +6550,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v3, s0, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, s1, v3 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX7LESS-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX7LESS-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-DPP-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 ; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 @@ -6361,13 +6606,14 @@ define amdgpu_kernel void 
@global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s43 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX7LESS-DPP-NEXT: .LBB9_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -6383,14 +6629,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, s1, v3 ; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 @@ -6401,9 +6647,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s40, s7 ; GFX9-DPP-NEXT: s_mov_b32 s41, s6 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], 
s[4:5] -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start @@ -6443,8 +6690,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX9-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX9-DPP-NEXT: .LBB9_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -6459,13 +6707,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: s_mov_b64 s[8:9], exec ; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, s9, v3 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[8:9] ; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 @@ -6521,8 +6770,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; 
GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX1064-DPP-NEXT: .LBB9_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -6538,12 +6788,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s8 ; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 @@ -6598,8 +6849,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX1032-DPP-NEXT: .LBB9_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -6609,14 +6861,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: s_mov_b64 s[8:9], exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 
s32, 32 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[8:9] ; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 @@ -6669,8 +6922,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1164-DPP-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX1164-DPP-NEXT: .LBB9_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm @@ -6680,13 +6935,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_mov_b32 s6, exec_lo ; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, s6 ; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1132-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 @@ -6734,8 +6991,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1132-DPP-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX1132-DPP-NEXT: .LBB9_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm @@ -6797,9 +7056,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execz .LBB10_5 +; GFX7LESS-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB10_5 ; GFX7LESS-NEXT: ; %bb.3: ; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 @@ -6841,13 +7101,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: 
s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_4 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], exec, s[42:43] +; GFX7LESS-NEXT: s_cselect_b64 exec, s[0:1], s[42:43] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB10_4 ; GFX7LESS-NEXT: .LBB10_5: ; GFX7LESS-NEXT: s_endpgm ; @@ -6902,9 +7163,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB10_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB10_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -6948,8 +7210,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-NEXT: s_cbranch_execnz .LBB10_4 +; GFX9-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX9-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX9-NEXT: s_cbranch_scc1 .LBB10_4 ; GFX9-NEXT: .LBB10_5: ; GFX9-NEXT: s_endpgm ; @@ -7004,9 +7267,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, 
exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB10_5 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB10_5 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -7051,8 +7315,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB10_4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-NEXT: s_cbranch_scc1 .LBB10_4 ; GFX1064-NEXT: .LBB10_5: ; GFX1064-NEXT: s_endpgm ; @@ -7107,9 +7372,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s44, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB10_5 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB10_5 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 @@ -7153,8 +7419,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB10_4 +; GFX1032-NEXT: 
s_andn2_b32 s0, exec_lo, s44 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-NEXT: s_cbranch_scc1 .LBB10_4 ; GFX1032-NEXT: .LBB10_5: ; GFX1032-NEXT: s_endpgm ; @@ -7200,12 +7467,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB10_5 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB10_5 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 @@ -7248,8 +7516,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-NEXT: s_cbranch_execnz .LBB10_4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-NEXT: s_cbranch_scc1 .LBB10_4 ; GFX1164-NEXT: .LBB10_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm @@ -7296,11 +7566,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s44, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: 
s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB10_5 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB10_5 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 @@ -7338,8 +7609,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-NEXT: s_cbranch_execnz .LBB10_4 +; GFX1132-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-NEXT: s_cbranch_scc1 .LBB10_4 ; GFX1132-NEXT: .LBB10_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm @@ -7420,13 +7693,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-DPP-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[42:43] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB10_1 +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[42:43] +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[42:43] +; 
GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -7517,8 +7791,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_readlane_b32 s42, v8, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -7562,8 +7837,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX9-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[46:47] +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[46:47] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX9-DPP-NEXT: .LBB10_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -7646,8 +7922,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -7692,8 +7969,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: 
v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX1064-DPP-NEXT: .LBB10_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -7771,8 +8049,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -7816,8 +8095,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX1032-DPP-NEXT: .LBB10_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -7896,13 +8176,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -7945,8 +8226,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1164-DPP-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX1164-DPP-NEXT: .LBB10_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm @@ -8014,15 +8297,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -8060,8 +8344,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1132-DPP-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX1132-DPP-NEXT: .LBB10_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm @@ -8083,8 +8369,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB11_3 +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[4:5] @@ -8113,10 +8400,11 @@ define amdgpu_kernel void 
@global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX7LESS-NEXT: .LBB11_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -8132,8 +8420,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB11_3 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -8157,9 +8446,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB11_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX9-NEXT: .LBB11_3: ; GFX9-NEXT: s_endpgm ; @@ -8175,8 +8465,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1064-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB11_3 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; 
GFX1064-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[0:1] ; GFX1064-NEXT: s_mov_b32 s5, 0x43300000 @@ -8199,8 +8490,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX1064-NEXT: .LBB11_3: ; GFX1064-NEXT: s_endpgm ; @@ -8216,8 +8508,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB11_3 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s6, s0 ; GFX1032-NEXT: s_mov_b32 s7, 0x43300000 @@ -8239,8 +8532,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1032-NEXT: s_andn2_b32 s2, exec_lo, s4 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1032-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX1032-NEXT: .LBB11_3: ; GFX1032-NEXT: s_endpgm ; @@ -8250,15 +8544,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-NEXT: v_mov_b32_e32 v1, s0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_clause 0x1 ; 
GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off ; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB11_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -8282,9 +8577,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1164-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX1164-NEXT: .LBB11_3: ; GFX1164-NEXT: s_endpgm ; @@ -8295,13 +8591,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off ; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB11_3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 
.LBB11_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -8322,9 +8619,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX1132-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s2, exec_lo, s4 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1132-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX1132-NEXT: .LBB11_3: ; GFX1132-NEXT: s_endpgm ; @@ -8340,8 +8638,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX7LESS-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-DPP-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[4:5] @@ -8370,10 +8669,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; 
GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX7LESS-DPP-NEXT: .LBB11_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -8389,8 +8689,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -8414,9 +8715,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX9-DPP-NEXT: .LBB11_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -8432,8 +8734,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[0:1] ; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000 @@ -8456,8 +8759,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-DPP-NEXT: 
v_mov_b32_e32 v2, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX1064-DPP-NEXT: .LBB11_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -8473,8 +8777,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s6, s0 ; GFX1032-DPP-NEXT: s_mov_b32 s7, 0x43300000 @@ -8496,8 +8801,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s2, exec_lo, s4 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX1032-DPP-NEXT: .LBB11_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -8507,15 +8813,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off ; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off ; 
GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -8539,9 +8846,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX1164-DPP-NEXT: .LBB11_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -8552,13 +8860,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off ; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB11_3 ; 
GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -8579,9 +8888,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s2, exec_lo, s4 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX1132-DPP-NEXT: .LBB11_3: ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") monotonic, !amdgpu.no.fine.grained.memory !1, !amdgpu.ignore.denormal.mode !1 @@ -8636,9 +8946,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execz .LBB12_5 +; GFX7LESS-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB12_5 ; GFX7LESS-NEXT: ; %bb.3: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 @@ -8659,10 +8970,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] 
; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_4 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB12_4 ; GFX7LESS-NEXT: .LBB12_5: ; GFX7LESS-NEXT: s_endpgm ; @@ -8711,9 +9023,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB12_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB12_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 @@ -8729,9 +9042,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB12_4 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB12_4 ; GFX9-NEXT: .LBB12_5: ; GFX9-NEXT: s_endpgm ; @@ -8780,9 +9094,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB12_5 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: 
s_cbranch_scc0 .LBB12_5 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 @@ -8799,8 +9114,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB12_4 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB12_4 ; GFX1064-NEXT: .LBB12_5: ; GFX1064-NEXT: s_endpgm ; @@ -8849,9 +9165,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB12_5 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB12_5 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 @@ -8867,8 +9184,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB12_4 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB12_4 ; GFX1032-NEXT: .LBB12_5: ; GFX1032-NEXT: s_endpgm ; @@ -8908,12 +9226,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1164-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB12_5 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB12_5 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 @@ -8930,9 +9249,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1164-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB12_4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB12_4 ; GFX1164-NEXT: .LBB12_5: ; GFX1164-NEXT: s_endpgm ; @@ -8972,11 +9292,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB12_5 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB12_5 ; GFX1132-NEXT: ; 
%bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 @@ -8991,9 +9312,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB12_4 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB12_4 ; GFX1132-NEXT: .LBB12_5: ; GFX1132-NEXT: s_endpgm ; @@ -9043,10 +9365,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB12_1 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -9130,11 +9453,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 ; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB12_3 ; 
GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 @@ -9149,9 +9473,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 ; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB12_2 ; GFX9-DPP-NEXT: .LBB12_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -9224,12 +9549,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB12_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -9246,8 +9572,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: 
s_cbranch_execnz .LBB12_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB12_2 ; GFX1064-DPP-NEXT: .LBB12_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -9314,13 +9641,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB12_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -9336,8 +9664,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB12_2 ; GFX1032-DPP-NEXT: .LBB12_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -9407,16 +9736,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 
exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB12_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 @@ -9433,9 +9762,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB12_2 ; GFX1164-DPP-NEXT: .LBB12_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -9496,15 +9826,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; 
GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB12_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 @@ -9519,9 +9850,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB12_2 ; GFX1132-DPP-NEXT: .LBB12_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() strictfp @@ -9542,8 +9874,9 @@ define amdgpu_kernel void 
@global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB13_3 +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[4:5] @@ -9572,10 +9905,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB13_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX7LESS-NEXT: .LBB13_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -9591,8 +9925,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB13_3 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -9616,9 +9951,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 
exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB13_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX9-NEXT: .LBB13_3: ; GFX9-NEXT: s_endpgm ; @@ -9634,8 +9970,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1064-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB13_3 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[0:1] ; GFX1064-NEXT: s_mov_b32 s5, 0x43300000 @@ -9658,8 +9995,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1064-NEXT: .LBB13_3: ; GFX1064-NEXT: s_endpgm ; @@ -9675,8 +10013,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB13_3 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s6, s0 ; GFX1032-NEXT: s_mov_b32 s7, 0x43300000 @@ -9698,8 +10037,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: s_or_b32 s4, 
vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1032-NEXT: s_andn2_b32 s2, exec_lo, s4 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1032-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1032-NEXT: .LBB13_3: ; GFX1032-NEXT: s_endpgm ; @@ -9709,15 +10049,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-NEXT: v_mov_b32_e32 v1, s0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off ; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB13_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -9741,9 +10082,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1164-NEXT: .LBB13_3: ; GFX1164-NEXT: s_endpgm ; @@ -9754,13 +10096,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-NEXT: 
v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off ; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB13_3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -9781,9 +10124,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX1132-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s2, exec_lo, s4 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1132-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1132-NEXT: .LBB13_3: ; GFX1132-NEXT: s_endpgm ; @@ -9799,8 +10143,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX7LESS-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-DPP-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x9 ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[4:5] @@ -9829,10 +10174,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX7LESS-DPP-NEXT: .LBB13_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -9848,8 +10194,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -9873,9 +10220,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX9-DPP-NEXT: .LBB13_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -9891,8 +10239,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; 
GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[0:1] ; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000 @@ -9915,8 +10264,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1064-DPP-NEXT: .LBB13_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -9932,8 +10282,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s6, s0 ; GFX1032-DPP-NEXT: s_mov_b32 s7, 0x43300000 @@ -9955,8 +10306,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s2, exec_lo, s4 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s2, 
s4 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1032-DPP-NEXT: .LBB13_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -9966,15 +10318,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off ; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -9998,9 +10351,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1164-DPP-NEXT: .LBB13_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -10011,13 +10365,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; 
GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off ; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -10038,9 +10393,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s2, exec_lo, s4 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1132-DPP-NEXT: .LBB13_3: ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !1, !amdgpu.ignore.denormal.mode !1 @@ -10095,9 +10451,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; 
GFX7LESS-NEXT: s_cbranch_execz .LBB14_5 +; GFX7LESS-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB14_5 ; GFX7LESS-NEXT: ; %bb.3: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 @@ -10118,10 +10475,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB14_4 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB14_4 ; GFX7LESS-NEXT: .LBB14_5: ; GFX7LESS-NEXT: s_endpgm ; @@ -10170,9 +10528,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB14_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB14_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 @@ -10188,9 +10547,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB14_4 +; GFX9-NEXT: s_cselect_b64 exec, 
s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB14_4 ; GFX9-NEXT: .LBB14_5: ; GFX9-NEXT: s_endpgm ; @@ -10239,9 +10599,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB14_5 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB14_5 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 @@ -10258,8 +10619,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB14_4 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_4 ; GFX1064-NEXT: .LBB14_5: ; GFX1064-NEXT: s_endpgm ; @@ -10308,9 +10670,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB14_5 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB14_5 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 @@ -10326,8 +10689,9 @@ define amdgpu_kernel void 
@global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB14_4 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_4 ; GFX1032-NEXT: .LBB14_5: ; GFX1032-NEXT: s_endpgm ; @@ -10367,12 +10731,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB14_5 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB14_5 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 @@ -10389,9 +10754,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB14_4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_4 ; GFX1164-NEXT: .LBB14_5: ; GFX1164-NEXT: s_endpgm ; @@ -10431,11 +10797,12 @@ 
define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB14_5 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB14_5 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 @@ -10450,9 +10817,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB14_4 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_4 ; GFX1132-NEXT: .LBB14_5: ; GFX1132-NEXT: s_endpgm ; @@ -10502,10 +10870,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB14_1 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; 
GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -10589,11 +10958,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 ; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB14_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB14_3 ; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 @@ -10608,9 +10978,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 ; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB14_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB14_2 ; GFX9-DPP-NEXT: .LBB14_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -10683,12 +11054,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; 
GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB14_3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB14_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -10705,8 +11077,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB14_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB14_2 ; GFX1064-DPP-NEXT: .LBB14_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -10773,13 +11146,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB14_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB14_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -10795,8 +11169,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; 
GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB14_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB14_2 ; GFX1032-DPP-NEXT: .LBB14_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -10866,16 +11241,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB14_3 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB14_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 @@ -10892,9 +11267,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 ; 
GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB14_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB14_2 ; GFX1164-DPP-NEXT: .LBB14_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -10955,15 +11331,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB14_3 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB14_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 @@ -10978,9 +11355,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] ; 
GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB14_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB14_2 ; GFX1132-DPP-NEXT: .LBB14_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() @@ -11036,9 +11414,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execz .LBB15_5 +; GFX7LESS-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB15_5 ; GFX7LESS-NEXT: ; %bb.3: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 @@ -11059,10 +11438,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB15_4 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB15_4 ; GFX7LESS-NEXT: .LBB15_5: ; GFX7LESS-NEXT: s_endpgm ; @@ -11111,9 +11491,10 @@ define amdgpu_kernel void 
@global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB15_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB15_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 @@ -11129,9 +11510,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB15_4 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB15_4 ; GFX9-NEXT: .LBB15_5: ; GFX9-NEXT: s_endpgm ; @@ -11180,9 +11562,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB15_5 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB15_5 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 @@ -11199,8 +11582,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; 
GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB15_4 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_4 ; GFX1064-NEXT: .LBB15_5: ; GFX1064-NEXT: s_endpgm ; @@ -11249,9 +11633,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB15_5 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB15_5 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 @@ -11267,8 +11652,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB15_4 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_4 ; GFX1032-NEXT: .LBB15_5: ; GFX1032-NEXT: s_endpgm ; @@ -11308,12 +11694,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB15_5 +; GFX1164-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB15_5 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 @@ -11330,9 +11717,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB15_4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_4 ; GFX1164-NEXT: .LBB15_5: ; GFX1164-NEXT: s_endpgm ; @@ -11372,11 +11760,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB15_5 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB15_5 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 @@ -11391,9 +11780,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB15_4 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_4 ; GFX1132-NEXT: .LBB15_5: ; GFX1132-NEXT: s_endpgm ; @@ -11443,10 +11833,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB15_1 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -11530,11 +11921,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 ; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB15_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB15_3 ; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 @@ -11549,9 +11941,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] ; 
GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 ; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB15_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB15_2 ; GFX9-DPP-NEXT: .LBB15_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -11624,12 +12017,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB15_3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB15_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -11646,8 +12040,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB15_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB15_2 ; GFX1064-DPP-NEXT: .LBB15_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -11714,13 +12109,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: 
v_permlanex16_b32 v5, v5, -1, -1 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB15_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB15_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -11736,8 +12132,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB15_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB15_2 ; GFX1032-DPP-NEXT: .LBB15_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -11807,16 +12204,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 -; 
GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB15_3 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB15_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 @@ -11833,9 +12230,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB15_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB15_2 ; GFX1164-DPP-NEXT: .LBB15_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -11896,15 +12294,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: 
s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB15_3 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB15_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 @@ -11919,9 +12318,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB15_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB15_2 ; GFX1132-DPP-NEXT: .LBB15_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.float.value() strictfp @@ -11947,10 +12347,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0xc3300000 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v5, exec_hi, v5 ; GFX7LESS-NEXT: v_add_f64 v[3:4], s[0:1], v[3:4] -; GFX7LESS-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: 
s_cbranch_execz .LBB16_3 +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s33, s8 ; GFX7LESS-NEXT: s_mov_b32 s40, s7 @@ -12002,13 +12403,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s43 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB16_2 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX7LESS-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX7LESS-NEXT: .LBB16_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -12031,8 +12433,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB16_3 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 20, v2 @@ -12086,8 +12489,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] -; 
GFX9-NEXT: s_cbranch_execnz .LBB16_2 +; GFX9-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX9-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX9-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX9-NEXT: .LBB16_3: ; GFX9-NEXT: s_endpgm ; @@ -12108,8 +12512,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB16_3 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 @@ -12164,8 +12569,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1064-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX1064-NEXT: .LBB16_3: ; GFX1064-NEXT: s_endpgm ; @@ -12186,8 +12592,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1032-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB16_3 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 @@ -12241,8 +12648,9 @@ define amdgpu_kernel void 
@global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1032-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX1032-NEXT: .LBB16_3: ; GFX1032-NEXT: s_endpgm ; @@ -12258,7 +12666,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:20 ; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:16 ; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:16 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -12266,8 +12673,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB16_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-NEXT: s_mov_b32 s33, s8 @@ -12318,8 +12727,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], 
exec, s[44:45] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX1164-NEXT: .LBB16_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm @@ -12337,14 +12748,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:16 ; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:16 ; GFX1132-NEXT: s_mov_b32 s32, 32 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB16_3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1132-NEXT: s_mov_b32 s33, s15 @@ -12389,8 +12801,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1132-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX1132-NEXT: .LBB16_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm @@ -12412,10 +12826,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: 
v_mov_b32_e32 v4, 0xc3300000 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v5, exec_hi, v5 ; GFX7LESS-DPP-NEXT: v_add_f64 v[3:4], s[0:1], v[3:4] -; GFX7LESS-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX7LESS-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] +; GFX7LESS-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-DPP-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 ; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 @@ -12467,13 +12882,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s43 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX7LESS-DPP-NEXT: .LBB16_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -12496,8 +12912,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: 
s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 @@ -12551,8 +12968,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX9-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX9-DPP-NEXT: .LBB16_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -12573,8 +12991,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 @@ -12629,8 +13048,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX1064-DPP-NEXT: .LBB16_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -12651,8 +13071,9 @@ define amdgpu_kernel void 
@global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 @@ -12706,8 +13127,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX1032-DPP-NEXT: .LBB16_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -12723,7 +13145,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:20 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:16 ; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:16 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -12731,8 +13152,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: 
s_cbranch_execz .LBB16_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 @@ -12783,8 +13206,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1164-DPP-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX1164-DPP-NEXT: .LBB16_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm @@ -12802,14 +13227,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:16 ; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:16 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: 
s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 @@ -12854,8 +13280,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1132-DPP-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX1132-DPP-NEXT: .LBB16_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm @@ -12917,9 +13345,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execz .LBB17_5 +; GFX7LESS-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB17_5 ; GFX7LESS-NEXT: ; %bb.3: ; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 @@ -12961,13 +13390,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: 
v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB17_4 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], exec, s[42:43] +; GFX7LESS-NEXT: s_cselect_b64 exec, s[0:1], s[42:43] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB17_4 ; GFX7LESS-NEXT: .LBB17_5: ; GFX7LESS-NEXT: s_endpgm ; @@ -13022,9 +13452,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB17_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB17_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -13068,8 +13499,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-NEXT: s_cbranch_execnz .LBB17_4 +; GFX9-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX9-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX9-NEXT: s_cbranch_scc1 .LBB17_4 ; GFX9-NEXT: .LBB17_5: ; GFX9-NEXT: s_endpgm ; @@ -13124,9 +13556,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB17_5 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: 
s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB17_5 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -13171,8 +13604,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB17_4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_4 ; GFX1064-NEXT: .LBB17_5: ; GFX1064-NEXT: s_endpgm ; @@ -13227,9 +13661,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s44, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB17_5 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB17_5 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 @@ -13273,8 +13708,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB17_4 +; GFX1032-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_4 ; GFX1032-NEXT: .LBB17_5: ; GFX1032-NEXT: s_endpgm ; @@ -13320,12 +13756,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; 
GFX1164-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB17_5 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB17_5 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 @@ -13368,8 +13805,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-NEXT: s_cbranch_execnz .LBB17_4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_4 ; GFX1164-NEXT: .LBB17_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm @@ -13416,11 +13855,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s44, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB17_5 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: 
s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB17_5 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 @@ -13458,8 +13898,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-NEXT: s_cbranch_execnz .LBB17_4 +; GFX1132-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_4 ; GFX1132-NEXT: .LBB17_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm @@ -13540,13 +13982,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-DPP-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[42:43] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[42:43] +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[42:43] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -13637,8 +14080,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: v_readlane_b32 s42, 
v8, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB17_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB17_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -13682,8 +14126,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_2 +; GFX9-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[46:47] +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[46:47] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB17_2 ; GFX9-DPP-NEXT: .LBB17_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -13766,8 +14211,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB17_3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB17_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -13812,8 +14258,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-DPP-NEXT: 
s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB17_2 ; GFX1064-DPP-NEXT: .LBB17_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -13891,8 +14338,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB17_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB17_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -13936,8 +14384,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB17_2 ; GFX1032-DPP-NEXT: .LBB17_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -14016,13 +14465,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: 
v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB17_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB17_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -14065,8 +14515,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_2 +; GFX1164-DPP-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB17_2 ; GFX1164-DPP-NEXT: .LBB17_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm @@ -14134,15 +14586,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: 
v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB17_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB17_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -14180,8 +14633,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_2 +; GFX1132-DPP-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB17_2 ; GFX1132-DPP-NEXT: .LBB17_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm @@ -14197,8 +14652,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB18_3 +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB18_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[4:5] @@ -14221,9 +14677,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: 
v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB18_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB18_2 ; GFX7LESS-NEXT: .LBB18_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -14233,8 +14690,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB18_3 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB18_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -14253,9 +14711,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB18_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB18_2 ; GFX9-NEXT: .LBB18_3: ; GFX9-NEXT: s_endpgm ; @@ -14265,8 +14724,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB18_3 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB18_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[4:5] @@ -14286,8 +14746,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; 
GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB18_2 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB18_2 ; GFX1064-NEXT: .LBB18_3: ; GFX1064-NEXT: s_endpgm ; @@ -14297,8 +14758,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1032-NEXT: s_mov_b32 s4, 0 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB18_3 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB18_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s5 @@ -14317,20 +14779,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: s_cbranch_execnz .LBB18_2 +; GFX1032-NEXT: s_andn2_b32 s2, exec_lo, s4 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1032-NEXT: s_cbranch_scc1 .LBB18_2 ; GFX1032-NEXT: .LBB18_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, 
s1, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB18_2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB18_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -14348,11 +14813,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB18_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB18_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX1132-NEXT: s_bcnt1_i32_b32 s0, s0 @@ -14372,8 +14838,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB18_3 +; GFX7LESS-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-DPP-NEXT: s_cbranch_scc0 .LBB18_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[4:5] @@ -14396,9 +14863,10 @@ define 
amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB18_2 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB18_2 ; GFX7LESS-DPP-NEXT: .LBB18_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -14408,8 +14876,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB18_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB18_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -14428,9 +14897,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB18_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB18_2 ; GFX9-DPP-NEXT: .LBB18_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -14440,8 +14910,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: 
s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB18_3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB18_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s3, s[4:5] @@ -14461,8 +14932,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB18_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB18_2 ; GFX1064-DPP-NEXT: .LBB18_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -14472,8 +14944,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB18_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB18_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s5 @@ -14492,20 +14965,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB18_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s2, exec_lo, s4 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB18_2 ; 
GFX1032-DPP-NEXT: .LBB18_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB18_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB18_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -14523,11 +14999,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB18_2 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB18_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 
; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, s0 @@ -14551,8 +15028,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB19_3 +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB19_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[4:5] @@ -14575,9 +15053,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB19_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB19_2 ; GFX7LESS-NEXT: .LBB19_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -14587,8 +15066,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB19_3 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB19_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -14607,9 +15087,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, 
s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB19_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB19_2 ; GFX9-NEXT: .LBB19_3: ; GFX9-NEXT: s_endpgm ; @@ -14619,8 +15100,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB19_3 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB19_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[4:5] @@ -14640,8 +15122,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB19_2 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_2 ; GFX1064-NEXT: .LBB19_3: ; GFX1064-NEXT: s_endpgm ; @@ -14651,8 +15134,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1032-NEXT: s_mov_b32 s4, 0 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB19_3 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB19_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s5 @@ -14671,20 +15155,23 @@ define amdgpu_kernel void 
@global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: s_cbranch_execnz .LBB19_2 +; GFX1032-NEXT: s_andn2_b32 s2, exec_lo, s4 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1032-NEXT: s_cbranch_scc1 .LBB19_2 ; GFX1032-NEXT: .LBB19_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB19_2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB19_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -14702,11 +15189,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB19_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 
vcc_lo, 0, v0 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB19_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX1132-NEXT: s_bcnt1_i32_b32 s0, s0 @@ -14726,8 +15214,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB19_3 +; GFX7LESS-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-DPP-NEXT: s_cbranch_scc0 .LBB19_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[4:5] @@ -14750,9 +15239,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB19_2 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB19_2 ; GFX7LESS-DPP-NEXT: .LBB19_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -14762,8 +15252,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB19_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB19_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -14782,9 +15273,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB19_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB19_2 ; GFX9-DPP-NEXT: .LBB19_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -14794,8 +15286,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB19_3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB19_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s3, s[4:5] @@ -14815,8 +15308,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB19_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB19_2 ; GFX1064-DPP-NEXT: .LBB19_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -14826,8 +15320,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, 
s5, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB19_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB19_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s5 @@ -14846,20 +15341,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB19_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s2, exec_lo, s4 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB19_2 ; GFX1032-DPP-NEXT: .LBB19_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB19_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB19_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -14877,11 +15375,12 @@ define amdgpu_kernel void 
@global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB19_2 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB19_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index f0196fadc4b3fd..2d0ffa13237628 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -21,8 +21,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB0_3 +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -43,9 +44,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; 
GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB0_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX7LESS-NEXT: .LBB0_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -54,8 +56,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB0_3 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 @@ -72,9 +75,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB0_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX9-NEXT: .LBB0_3: ; GFX9-NEXT: s_endpgm ; @@ -83,8 +87,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB0_2 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -98,8 +103,9 @@ define amdgpu_kernel void 
@global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB0_2 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 @@ -112,11 +118,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB0_2 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 @@ -131,10 +138,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB0_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 
v1, 4.0 @@ -150,8 +158,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX7LESS-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-DPP-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -172,9 +181,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX7LESS-DPP-NEXT: .LBB0_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -183,8 +193,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 @@ -201,9 +212,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, 
v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX9-DPP-NEXT: .LBB0_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -212,8 +224,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_2 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -227,8 +240,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_2 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -241,11 +255,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: 
v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_2 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -260,10 +275,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_2 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 @@ -325,9 +341,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execz .LBB1_5 +; GFX7LESS-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB1_5 ; GFX7LESS-NEXT: ; %bb.3: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 @@ -348,9 +365,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; 
GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_4 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB1_4 ; GFX7LESS-NEXT: .LBB1_5: ; GFX7LESS-NEXT: s_endpgm ; @@ -399,9 +417,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB1_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB1_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 @@ -418,9 +437,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB1_4 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB1_4 ; GFX9-NEXT: .LBB1_5: ; GFX9-NEXT: s_endpgm ; @@ -469,9 +489,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB1_4 +; GFX1064-NEXT: 
s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB1_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -524,9 +545,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB1_4 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB1_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 @@ -571,12 +593,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB1_4 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB1_4 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 @@ -621,11 +644,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 
-; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB1_4 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB1_4 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 @@ -680,9 +704,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB1_1 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -757,8 +782,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB1_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -775,9 +801,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] 
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX9-DPP-NEXT: .LBB1_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -839,18 +866,21 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 32 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 0 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_max_f32_e64 v3, s2, s2 ; GFX1064-DPP-NEXT: v_max_f32_e64 v4, s3, s3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB1_2 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -904,9 +934,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 
0x7fc00000 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 @@ -914,13 +944,16 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB1_2 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -966,12 +999,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf @@ -985,21 +1018,24 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_2 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -1043,10 +1079,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: v_dual_max_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0x7fc00000 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf @@ -1056,17 +1092,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -1085,8 +1124,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB2_3 +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -1107,9 +1147,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; 
GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB2_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX7LESS-NEXT: .LBB2_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -1118,8 +1159,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB2_3 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 @@ -1136,9 +1178,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB2_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX9-NEXT: .LBB2_3: ; GFX9-NEXT: s_endpgm ; @@ -1147,8 +1190,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB2_2 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -1162,8 +1206,9 @@ define amdgpu_kernel void 
@global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB2_2 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 @@ -1176,11 +1221,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB2_2 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 @@ -1195,10 +1241,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB2_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: 
v_dual_mov_b32 v1, 4.0 @@ -1214,8 +1261,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX7LESS-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-DPP-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1236,9 +1284,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX7LESS-DPP-NEXT: .LBB2_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -1247,8 +1296,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 @@ -1265,9 +1315,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: 
v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX9-DPP-NEXT: .LBB2_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -1276,8 +1327,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -1291,8 +1343,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -1305,11 +1358,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -1324,10 +1378,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 @@ -1390,9 +1445,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execz .LBB3_5 +; GFX7LESS-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB3_5 ; GFX7LESS-NEXT: ; %bb.3: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 @@ -1413,9 +1469,10 @@ define amdgpu_kernel void 
@global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_4 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB3_4 ; GFX7LESS-NEXT: .LBB3_5: ; GFX7LESS-NEXT: s_endpgm ; @@ -1464,9 +1521,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB3_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB3_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 @@ -1483,9 +1541,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB3_4 ; GFX9-NEXT: .LBB3_5: ; GFX9-NEXT: s_endpgm ; @@ -1534,9 +1593,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; 
GFX1064-NEXT: s_cbranch_execz .LBB3_4 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -1589,9 +1649,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB3_4 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 @@ -1636,12 +1697,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB3_4 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 @@ -1686,11 +1748,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132-NEXT: ; %bb.2: ; 
%ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB3_4 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 @@ -1745,9 +1808,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB3_1 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -1822,8 +1886,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -1840,9 +1905,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 
s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB3_2 ; GFX9-DPP-NEXT: .LBB3_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -1904,18 +1970,21 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 32 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 0 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_max_f32_e64 v3, s2, s2 ; GFX1064-DPP-NEXT: v_max_f32_e64 v4, s3, s3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB3_2 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -1969,9 +2038,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; 
GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 @@ -1979,13 +2048,16 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB3_2 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -2031,12 +2103,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; 
GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf @@ -2050,21 +2122,24 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; 
GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_2 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -2108,10 +2183,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: v_dual_max_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0x7fc00000 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf @@ -2121,17 +2196,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; 
GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -2151,8 +2229,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB4_3 +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -2173,9 +2252,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX7LESS-NEXT: 
s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX7LESS-NEXT: .LBB4_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -2184,8 +2264,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB4_3 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 @@ -2202,9 +2283,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB4_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX9-NEXT: .LBB4_3: ; GFX9-NEXT: s_endpgm ; @@ -2213,8 +2295,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -2228,8 +2311,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 @@ -2242,11 +2326,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 @@ -2261,10 +2346,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1132-NEXT: ; 
%bb.1: ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 @@ -2280,8 +2366,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX7LESS-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-DPP-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2302,9 +2389,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX7LESS-DPP-NEXT: .LBB4_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -2313,8 +2401,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 @@ -2331,9 +2420,10 @@ define amdgpu_kernel void 
@global_atomic_fmax_uni_address_uni_value_default_scop ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX9-DPP-NEXT: .LBB4_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -2342,8 +2432,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -2357,8 +2448,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -2371,11 +2463,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: 
s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -2390,10 +2483,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 @@ -2455,9 +2549,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execz .LBB5_5 +; GFX7LESS-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB5_5 ; GFX7LESS-NEXT: ; %bb.3: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 ; 
GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 @@ -2478,9 +2573,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_4 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX7LESS-NEXT: .LBB5_5: ; GFX7LESS-NEXT: s_endpgm ; @@ -2529,9 +2625,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB5_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB5_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 @@ -2548,9 +2645,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB5_4 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX9-NEXT: .LBB5_5: ; GFX9-NEXT: s_endpgm ; @@ -2599,9 +2697,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; 
GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB5_4 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB5_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -2654,9 +2753,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB5_4 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB5_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 @@ -2701,12 +2801,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB5_4 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB5_4 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 @@ -2751,11 +2852,12 @@ define amdgpu_kernel void 
@global_atomic_fmax_uni_address_div_value_default_scop ; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB5_4 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB5_4 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 @@ -2810,9 +2912,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB5_1 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -2887,8 +2990,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB5_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -2905,9 +3009,10 @@ define amdgpu_kernel void 
@global_atomic_fmax_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB5_2 ; GFX9-DPP-NEXT: .LBB5_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -2969,18 +3074,21 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 32 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 0 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_max_f32_e64 v3, s2, s2 ; GFX1064-DPP-NEXT: v_max_f32_e64 v4, s3, s3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB5_2 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -3034,9 +3142,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX1032-DPP-NEXT: 
v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 @@ -3044,13 +3152,16 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB5_2 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -3096,12 +3207,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf @@ -3115,21 +3226,24 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) 
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_2 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -3173,10 +3287,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: v_dual_max_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0x7fc00000 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf @@ -3186,17 +3300,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) 
| instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -3223,8 +3340,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB6_3 +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s33, s8 ; GFX7LESS-NEXT: s_mov_b32 s40, s7 @@ 
-3277,13 +3395,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s43 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_2 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX7LESS-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX7LESS-NEXT: .LBB6_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -3297,25 +3416,26 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX9-NEXT: s_add_u32 s48, s48, s9 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB6_3 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX9-NEXT: s_mov_b32 s33, s8 -; GFX9-NEXT: s_mov_b32 s40, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX9-NEXT: s_mov_b32 s40, s7 ; GFX9-NEXT: s_mov_b32 s41, s6 ; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; 
GFX9-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-NEXT: .LBB6_2: ; %atomicrmw.start @@ -3356,8 +3476,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-NEXT: s_cbranch_execnz .LBB6_2 +; GFX9-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX9-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX9-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX9-NEXT: .LBB6_3: ; GFX9-NEXT: s_endpgm ; @@ -3371,13 +3492,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX1064-NEXT: s_add_u32 s48, s48, s9 ; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB6_3 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1064-NEXT: s_mov_b32 s33, s8 @@ -3431,8 +3553,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1064-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1064-NEXT: .LBB6_3: ; GFX1064-NEXT: s_endpgm ; @@ -3446,13 
+3569,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_add_u32 s48, s48, s9 ; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1032-NEXT: s_mov_b32 s44, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB6_3 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1032-NEXT: s_mov_b32 s33, s8 @@ -3505,8 +3629,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1032-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1032-NEXT: .LBB6_3: ; GFX1032-NEXT: s_endpgm ; @@ -3514,15 +3639,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1164-NEXT: s_mov_b32 s32, 32 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB6_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: 
s_cbranch_scc0 .LBB6_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1164-NEXT: s_mov_b32 s33, s8 ; GFX1164-NEXT: s_mov_b32 s40, s7 ; GFX1164-NEXT: s_mov_b32 s41, s6 @@ -3572,8 +3698,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1164-NEXT: .LBB6_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm @@ -3582,14 +3710,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1132-NEXT: s_mov_b32 s44, 0 ; GFX1132-NEXT: s_mov_b32 s32, 32 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB6_3 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1132-NEXT: s_mov_b32 s33, s15 ; GFX1132-NEXT: s_mov_b32 s40, s14 ; GFX1132-NEXT: s_mov_b32 s41, s13 @@ -3634,8 +3764,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1132-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1132-NEXT: .LBB6_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm @@ -3653,8 +3785,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX7LESS-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-DPP-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 ; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 @@ -3707,13 +3840,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s43 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB6_2 ; 
GFX7LESS-DPP-NEXT: .LBB6_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -3727,25 +3861,26 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX9-DPP-NEXT: s_mov_b32 s33, s8 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s40, s7 ; GFX9-DPP-NEXT: s_mov_b32 s41, s6 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-DPP-NEXT: .LBB6_2: ; %atomicrmw.start @@ -3786,8 +3921,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX9-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX9-DPP-NEXT: .LBB6_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ 
-3801,13 +3937,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 @@ -3861,8 +3998,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1064-DPP-NEXT: .LBB6_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -3876,13 +4014,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo 
+; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 @@ -3935,8 +4074,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1032-DPP-NEXT: .LBB6_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -3944,15 +4084,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 ; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 @@ -4002,8 +4143,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; 
GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1164-DPP-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1164-DPP-NEXT: .LBB6_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm @@ -4012,14 +4155,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 @@ -4064,8 +4209,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; 
GFX1132-DPP-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1132-DPP-NEXT: .LBB6_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm @@ -4129,9 +4276,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execz .LBB7_5 +; GFX7LESS-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB7_5 ; GFX7LESS-NEXT: ; %bb.3: ; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 @@ -4175,13 +4323,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_4 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], exec, s[42:43] +; GFX7LESS-NEXT: s_cselect_b64 exec, s[0:1], s[42:43] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB7_4 ; GFX7LESS-NEXT: .LBB7_5: ; GFX7LESS-NEXT: s_endpgm ; @@ -4238,9 +4387,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB7_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB7_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -4286,8 +4436,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-NEXT: s_cbranch_execnz .LBB7_4 +; GFX9-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX9-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX9-NEXT: s_cbranch_scc1 .LBB7_4 ; GFX9-NEXT: .LBB7_5: ; GFX9-NEXT: s_endpgm ; @@ -4344,9 +4495,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB7_5 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB7_5 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -4393,8 +4545,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB7_4 +; 
GFX1064-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-NEXT: s_cbranch_scc1 .LBB7_4 ; GFX1064-NEXT: .LBB7_5: ; GFX1064-NEXT: s_endpgm ; @@ -4432,50 +4585,51 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0x7ff80000 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1032-NEXT: v_max_f64 v[2:3], v[3:4], v[3:4] ; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1032-NEXT: v_max_f64 v[3:4], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s44, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB7_5 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB7_5 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] +; GFX1032-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: 
global_load_dwordx2 v[4:5], v0, s[42:43] +; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] ; GFX1032-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX1032-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1032-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 @@ -4486,21 +4640,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b32 s13, s40 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v4, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB7_4 +; GFX1032-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-NEXT: s_cbranch_scc1 .LBB7_4 ; GFX1032-NEXT: .LBB7_5: ; GFX1032-NEXT: s_endpgm ; @@ -4548,12 +4703,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB7_5 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB7_5 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 @@ -4598,8 +4754,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-NEXT: s_cbranch_execnz .LBB7_4 +; GFX1164-NEXT: 
s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-NEXT: s_cbranch_scc1 .LBB7_4 ; GFX1164-NEXT: .LBB7_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm @@ -4649,11 +4807,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s44, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB7_5 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB7_5 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 @@ -4697,8 +4856,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-NEXT: s_cbranch_execnz .LBB7_4 +; GFX1132-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-NEXT: s_cbranch_scc1 .LBB7_4 ; GFX1132-NEXT: .LBB7_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm @@ -4779,12 +4940,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-DPP-NEXT: 
v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-DPP-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[42:43] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[42:43] +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[42:43] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -4882,8 +5044,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_readlane_b32 s42, v8, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -4929,8 +5092,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX9-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[46:47] +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[46:47] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX9-DPP-NEXT: .LBB7_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -5021,8 +5185,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 
-; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -5069,8 +5234,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX1064-DPP-NEXT: .LBB7_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -5154,8 +5320,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -5201,8 +5368,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX1032-DPP-NEXT: .LBB7_3: ; GFX1032-DPP-NEXT: 
s_endpgm ; @@ -5291,13 +5459,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -5342,8 +5511,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1164-DPP-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX1164-DPP-NEXT: .LBB7_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm @@ -5417,17 +5588,19 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 
; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -5468,8 +5641,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1132-DPP-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX1132-DPP-NEXT: .LBB7_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; 
GFX1132-DPP-NEXT: s_endpgm @@ -5484,8 +5659,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB8_3 +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -5509,10 +5685,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v4 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v5 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB8_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX7LESS-NEXT: .LBB8_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -5521,8 +5698,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB8_3 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 @@ -5541,9 +5719,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; 
GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB8_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX9-NEXT: .LBB8_3: ; GFX9-NEXT: s_endpgm ; @@ -5552,8 +5731,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB8_2 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -5568,8 +5748,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB8_2 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 @@ -5583,11 +5764,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1164-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB8_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_cmp_lg_u64 
vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 @@ -5608,9 +5790,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1164-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX1164-NEXT: .LBB8_3: ; GFX1164-NEXT: s_endpgm ; @@ -5618,10 +5801,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB8_3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v4, 0 @@ -5639,9 +5823,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX1132-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s2, exec_lo, s4 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX1132-NEXT: .LBB8_3: ; GFX1132-NEXT: s_endpgm ; @@ -5650,8 +5835,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX7LESS-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-DPP-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -5675,10 +5861,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX7LESS-DPP-NEXT: .LBB8_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -5687,8 +5874,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX9-DPP-NEXT: ; %bb.1: ; 
GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 @@ -5707,9 +5895,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX9-DPP-NEXT: .LBB8_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -5718,8 +5907,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_2 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -5734,8 +5924,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_2 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -5749,11 +5940,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1164-DPP-LABEL: 
global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -5774,9 +5966,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX1164-DPP-NEXT: .LBB8_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -5784,10 +5977,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: 
s_cbranch_scc0 .LBB8_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -5805,9 +5999,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s2, exec_lo, s4 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX1132-DPP-NEXT: .LBB8_3: ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fmax ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") monotonic, !amdgpu.no.fine.grained.memory !1 @@ -5864,9 +6059,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execz .LBB9_5 +; GFX7LESS-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB9_5 ; GFX7LESS-NEXT: ; %bb.3: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 @@ -5889,10 +6085,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: 
v_mov_b32_e32 v2, v6 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_4 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB9_4 ; GFX7LESS-NEXT: .LBB9_5: ; GFX7LESS-NEXT: s_endpgm ; @@ -5943,9 +6140,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB9_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB9_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 @@ -5963,9 +6161,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB9_4 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB9_4 ; GFX9-NEXT: .LBB9_5: ; GFX9-NEXT: s_endpgm ; @@ -6016,9 +6215,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB9_4 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB9_4 ; 
GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -6073,9 +6273,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB9_4 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB9_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 @@ -6122,12 +6323,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB9_5 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB9_5 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 @@ -6147,9 +6349,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1164-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB9_4 +; 
GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_4 ; GFX1164-NEXT: .LBB9_5: ; GFX1164-NEXT: s_endpgm ; @@ -6192,11 +6395,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB9_5 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB9_5 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 @@ -6215,9 +6419,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB9_4 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_4 ; GFX1132-NEXT: .LBB9_5: ; GFX1132-NEXT: s_endpgm ; @@ -6269,10 +6474,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; 
GFX7LESS-DPP-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_1 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -6363,11 +6569,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 ; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 @@ -6384,9 +6591,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 ; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX9-DPP-NEXT: .LBB9_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -6467,12 +6675,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: 
v_mbcnt_hi_u32_b32 v2, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_2 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -6550,12 +6759,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_2 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -6640,16 +6850,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 @@ -6669,9 +6879,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX1164-DPP-NEXT: .LBB9_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -6738,17 +6949,19 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 @@ -6766,9 +6979,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX1132-DPP-NEXT: .LBB9_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() @@ -6790,8 +7004,9 @@ define amdgpu_kernel void 
@global_atomic_fmax_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB10_3 +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s33, s8 ; GFX7LESS-NEXT: s_mov_b32 s40, s7 @@ -6844,13 +7059,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s43 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_2 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX7LESS-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX7LESS-NEXT: .LBB10_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -6864,25 +7080,26 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX9-NEXT: s_add_u32 s48, s48, s9 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB10_3 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[42:43], 
s[2:3], 0x24 +; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX9-NEXT: s_mov_b32 s33, s8 -; GFX9-NEXT: s_mov_b32 s40, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX9-NEXT: s_mov_b32 s40, s7 ; GFX9-NEXT: s_mov_b32 s41, s6 ; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-NEXT: .LBB10_2: ; %atomicrmw.start @@ -6923,8 +7140,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-NEXT: s_cbranch_execnz .LBB10_2 +; GFX9-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX9-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX9-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX9-NEXT: .LBB10_3: ; GFX9-NEXT: s_endpgm ; @@ -6938,13 +7156,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX1064-NEXT: s_add_u32 s48, s48, s9 ; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB10_3 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1064-NEXT: s_mov_b32 s33, s8 @@ 
-6998,8 +7217,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1064-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX1064-NEXT: .LBB10_3: ; GFX1064-NEXT: s_endpgm ; @@ -7013,13 +7233,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_add_u32 s48, s48, s9 ; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1032-NEXT: s_mov_b32 s44, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB10_3 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1032-NEXT: s_mov_b32 s33, s8 @@ -7072,8 +7293,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1032-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX1032-NEXT: .LBB10_3: ; GFX1032-NEXT: s_endpgm ; @@ -7081,15 +7303,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164: ; %bb.0: ; GFX1164-NEXT: 
v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1164-NEXT: s_mov_b32 s32, 32 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB10_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1164-NEXT: s_mov_b32 s33, s8 ; GFX1164-NEXT: s_mov_b32 s40, s7 ; GFX1164-NEXT: s_mov_b32 s41, s6 @@ -7139,8 +7362,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX1164-NEXT: .LBB10_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm @@ -7149,14 +7374,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1132-NEXT: s_mov_b32 s44, 0 ; GFX1132-NEXT: s_mov_b32 s32, 32 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB10_3 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1132-NEXT: s_mov_b32 s33, s15 ; GFX1132-NEXT: s_mov_b32 s40, s14 ; GFX1132-NEXT: s_mov_b32 s41, s13 @@ -7201,8 +7428,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1132-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX1132-NEXT: .LBB10_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm @@ -7220,8 +7449,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX7LESS-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-DPP-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 ; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 @@ -7274,13 +7504,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s43 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX7LESS-DPP-NEXT: .LBB10_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -7294,25 +7525,26 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX9-DPP-NEXT: s_mov_b32 s33, s8 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s40, s7 ; GFX9-DPP-NEXT: s_mov_b32 s41, s6 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-DPP-NEXT: .LBB10_2: ; 
%atomicrmw.start @@ -7353,8 +7585,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX9-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX9-DPP-NEXT: .LBB10_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -7368,13 +7601,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 @@ -7428,8 +7662,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX1064-DPP-NEXT: .LBB10_3: ; GFX1064-DPP-NEXT: 
s_endpgm ; @@ -7443,13 +7678,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 @@ -7502,8 +7738,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX1032-DPP-NEXT: .LBB10_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -7511,15 +7748,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; 
GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 ; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 @@ -7569,8 +7807,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1164-DPP-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX1164-DPP-NEXT: .LBB10_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm @@ -7579,14 +7819,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 
0x24 +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 @@ -7631,8 +7873,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1132-DPP-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX1132-DPP-NEXT: .LBB10_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm @@ -7696,9 +7940,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execz .LBB11_5 +; GFX7LESS-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB11_5 ; GFX7LESS-NEXT: ; %bb.3: ; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 @@ -7742,13 +7987,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; 
GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_4 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], exec, s[42:43] +; GFX7LESS-NEXT: s_cselect_b64 exec, s[0:1], s[42:43] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB11_4 ; GFX7LESS-NEXT: .LBB11_5: ; GFX7LESS-NEXT: s_endpgm ; @@ -7805,9 +8051,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB11_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB11_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -7853,8 +8100,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-NEXT: s_cbranch_execnz .LBB11_4 +; GFX9-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX9-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX9-NEXT: s_cbranch_scc1 .LBB11_4 ; GFX9-NEXT: .LBB11_5: ; GFX9-NEXT: s_endpgm ; @@ -7911,9 +8159,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB11_5 +; GFX1064-NEXT: s_xor_b64 
s[0:1], vcc, exec +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB11_5 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -7960,8 +8209,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB11_4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-NEXT: s_cbranch_scc1 .LBB11_4 ; GFX1064-NEXT: .LBB11_5: ; GFX1064-NEXT: s_endpgm ; @@ -7999,50 +8249,51 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0x7ff80000 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1032-NEXT: v_max_f64 v[2:3], v[3:4], v[3:4] ; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1032-NEXT: v_max_f64 v[3:4], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s44, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; 
GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB11_5 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB11_5 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] +; GFX1032-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[42:43] +; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] ; GFX1032-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX1032-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1032-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 @@ -8053,21 +8304,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-NEXT: s_mov_b32 s13, s40 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: 
s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v4, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB11_4 +; GFX1032-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-NEXT: s_cbranch_scc1 .LBB11_4 ; GFX1032-NEXT: .LBB11_5: ; GFX1032-NEXT: s_endpgm ; @@ -8115,12 +8367,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB11_5 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: 
s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB11_5 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 @@ -8165,8 +8418,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-NEXT: s_cbranch_execnz .LBB11_4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-NEXT: s_cbranch_scc1 .LBB11_4 ; GFX1164-NEXT: .LBB11_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm @@ -8216,11 +8471,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s44, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB11_5 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB11_5 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 @@ -8264,8 +8520,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-NEXT: 
s_cbranch_execnz .LBB11_4 +; GFX1132-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-NEXT: s_cbranch_scc1 .LBB11_4 ; GFX1132-NEXT: .LBB11_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm @@ -8346,12 +8604,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-DPP-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[42:43] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB11_1 +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[42:43] +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[42:43] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -8449,8 +8708,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: v_readlane_b32 s42, v8, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -8496,8 +8756,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; 
GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX9-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[46:47] +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[46:47] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX9-DPP-NEXT: .LBB11_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -8588,8 +8849,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -8636,8 +8898,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX1064-DPP-NEXT: .LBB11_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -8721,8 +8984,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 
s[42:43], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -8768,8 +9032,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX1032-DPP-NEXT: .LBB11_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -8858,13 +9123,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -8909,8 +9175,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; 
GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1164-DPP-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX1164-DPP-NEXT: .LBB11_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm @@ -8984,17 +9252,19 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -9035,8 +9305,10 @@ define amdgpu_kernel void 
@global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1132-DPP-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX1132-DPP-NEXT: .LBB11_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm @@ -9051,8 +9323,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB12_3 +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB12_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -9073,9 +9346,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB12_2 ; GFX7LESS-NEXT: .LBB12_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -9084,8 +9358,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: 
v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB12_3 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB12_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 @@ -9102,9 +9377,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB12_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB12_2 ; GFX9-NEXT: .LBB12_3: ; GFX9-NEXT: s_endpgm ; @@ -9113,8 +9389,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB12_2 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB12_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -9128,8 +9405,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB12_2 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB12_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 
v0, 0 @@ -9142,11 +9420,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB12_2 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB12_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 @@ -9161,10 +9440,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB12_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB12_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 @@ -9180,8 +9460,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], 
vcc -; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX7LESS-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-DPP-NEXT: s_cbranch_scc0 .LBB12_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -9202,9 +9483,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB12_2 ; GFX7LESS-DPP-NEXT: .LBB12_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -9213,8 +9495,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB12_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 @@ -9231,9 +9514,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: 
s_cbranch_scc1 .LBB12_2 ; GFX9-DPP-NEXT: .LBB12_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -9242,8 +9526,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_2 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB12_2 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -9257,8 +9542,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_2 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB12_2 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -9271,11 +9557,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_2 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; 
GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB12_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -9290,10 +9577,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_2 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB12_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 @@ -9313,8 +9601,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB13_3 +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -9335,9 +9624,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 -; 
GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB13_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX7LESS-NEXT: .LBB13_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -9346,8 +9636,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB13_3 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 @@ -9364,9 +9655,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB13_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX9-NEXT: .LBB13_3: ; GFX9-NEXT: s_endpgm ; @@ -9375,8 +9667,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB13_2 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB13_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -9390,8 +9683,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1032: ; %bb.0: ; GFX1032-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB13_2 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB13_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 @@ -9404,11 +9698,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB13_2 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB13_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 @@ -9423,10 +9718,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB13_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB13_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_dual_mov_b32 v0, 0 
:: v_dual_mov_b32 v1, 4.0 @@ -9442,8 +9738,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX7LESS-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-DPP-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -9464,9 +9761,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX7LESS-DPP-NEXT: .LBB13_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -9475,8 +9773,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 @@ -9493,9 +9792,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; 
GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX9-DPP-NEXT: .LBB13_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -9504,8 +9804,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_2 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB13_2 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -9519,8 +9820,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_2 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB13_2 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -9533,11 +9835,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; 
GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB13_2 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB13_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -9552,10 +9855,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB13_2 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB13_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index f672c9c6afa22b..a29e6f333bc56d 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -21,8 +21,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB0_3 
+; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -43,9 +44,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB0_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX7LESS-NEXT: .LBB0_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -54,8 +56,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB0_3 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 @@ -72,9 +75,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB0_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX9-NEXT: .LBB0_3: ; GFX9-NEXT: s_endpgm ; @@ -83,8 +87,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; 
GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB0_2 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -98,8 +103,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB0_2 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 @@ -112,11 +118,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB0_2 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 @@ -131,10 +138,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: 
s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB0_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 @@ -150,8 +158,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX7LESS-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-DPP-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -172,9 +181,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX7LESS-DPP-NEXT: .LBB0_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -183,8 +193,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; 
GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 @@ -201,9 +212,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX9-DPP-NEXT: .LBB0_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -212,8 +224,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_2 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -227,8 +240,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_2 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; 
GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -241,11 +255,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_2 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -260,10 +275,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_2 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 @@ -325,9 +341,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_xor_b64 
s[0:1], exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execz .LBB1_5 +; GFX7LESS-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB1_5 ; GFX7LESS-NEXT: ; %bb.3: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 @@ -348,9 +365,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_4 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB1_4 ; GFX7LESS-NEXT: .LBB1_5: ; GFX7LESS-NEXT: s_endpgm ; @@ -399,9 +417,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB1_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB1_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 @@ -418,9 +437,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB1_4 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB1_4 
; GFX9-NEXT: .LBB1_5: ; GFX9-NEXT: s_endpgm ; @@ -469,9 +489,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB1_4 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB1_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -524,9 +545,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB1_4 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB1_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 @@ -571,12 +593,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB1_4 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; 
GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB1_4 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 @@ -621,11 +644,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB1_4 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB1_4 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 @@ -680,9 +704,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB1_1 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -757,8 +782,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: 
s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB1_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -775,9 +801,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX9-DPP-NEXT: .LBB1_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -839,18 +866,21 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 32 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 0 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_max_f32_e64 v3, s2, s2 ; GFX1064-DPP-NEXT: v_max_f32_e64 v4, s3, s3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB1_2 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], 
s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -904,9 +934,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 @@ -914,13 +944,16 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB1_2 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -966,12 +999,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 
row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf @@ -985,21 +1018,24 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_2 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -1043,10 +1079,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: v_dual_min_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0x7fc00000 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: 
v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf @@ -1056,17 +1092,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -1085,8 +1124,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB2_3 +; 
GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -1107,9 +1147,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB2_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX7LESS-NEXT: .LBB2_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -1118,8 +1159,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB2_3 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 @@ -1136,9 +1178,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB2_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX9-NEXT: .LBB2_3: ; GFX9-NEXT: s_endpgm ; @@ -1147,8 +1190,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB2_2 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -1162,8 +1206,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB2_2 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 @@ -1176,11 +1221,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB2_2 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 @@ -1195,10 +1241,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; 
GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB2_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 @@ -1214,8 +1261,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX7LESS-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-DPP-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1236,9 +1284,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX7LESS-DPP-NEXT: .LBB2_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -1247,8 +1296,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: 
s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 @@ -1265,9 +1315,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX9-DPP-NEXT: .LBB2_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -1276,8 +1327,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -1291,8 +1343,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -1305,11 +1358,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -1324,10 +1378,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 @@ -1390,9 +1445,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: 
s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execz .LBB3_5 +; GFX7LESS-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB3_5 ; GFX7LESS-NEXT: ; %bb.3: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 @@ -1413,9 +1469,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_4 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB3_4 ; GFX7LESS-NEXT: .LBB3_5: ; GFX7LESS-NEXT: s_endpgm ; @@ -1464,9 +1521,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB3_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB3_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 @@ -1483,9 +1541,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: 
s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB3_4 ; GFX9-NEXT: .LBB3_5: ; GFX9-NEXT: s_endpgm ; @@ -1534,9 +1593,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB3_4 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -1589,9 +1649,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB3_4 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 @@ -1636,12 +1697,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB3_4 +; GFX1164-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 @@ -1686,11 +1748,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB3_4 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 @@ -1745,9 +1808,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB3_1 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -1822,8 +1886,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: 
s_cbranch_execz .LBB3_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -1840,9 +1905,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB3_2 ; GFX9-DPP-NEXT: .LBB3_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -1904,18 +1970,21 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 32 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 0 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_max_f32_e64 v3, s2, s2 ; GFX1064-DPP-NEXT: v_max_f32_e64 v4, s3, s3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB3_2 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: 
s_cbranch_scc0 .LBB3_2 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -1969,9 +2038,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 @@ -1979,13 +2048,16 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB3_2 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -2031,12 +2103,12 @@ define amdgpu_kernel void 
@global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf @@ -2050,21 +2122,24 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; 
GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_2 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -2108,10 +2183,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: v_dual_min_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0x7fc00000 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf @@ -2121,17 +2196,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -2151,8 +2229,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 
-; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB4_3 +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -2173,9 +2252,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX7LESS-NEXT: .LBB4_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -2184,8 +2264,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB4_3 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 @@ -2202,9 +2283,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB4_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX9-NEXT: .LBB4_3: ; GFX9-NEXT: s_endpgm ; @@ -2213,8 +2295,9 @@ define amdgpu_kernel void 
@global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -2228,8 +2311,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 @@ -2242,11 +2326,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 @@ -2261,10 +2346,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1132-LABEL: 
global_atomic_fmin_uni_address_uni_value_default_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 @@ -2280,8 +2366,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX7LESS-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-DPP-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2302,9 +2389,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX7LESS-DPP-NEXT: .LBB4_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -2313,8 +2401,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, 
exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 @@ -2331,9 +2420,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX9-DPP-NEXT: .LBB4_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -2342,8 +2432,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -2357,8 +2448,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: 
s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -2371,11 +2463,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -2390,10 +2483,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 @@ -2455,9 +2549,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; 
GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execz .LBB5_5 +; GFX7LESS-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB5_5 ; GFX7LESS-NEXT: ; %bb.3: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 @@ -2478,9 +2573,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_4 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX7LESS-NEXT: .LBB5_5: ; GFX7LESS-NEXT: s_endpgm ; @@ -2529,9 +2625,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB5_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB5_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 @@ -2548,9 +2645,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: 
v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB5_4 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX9-NEXT: .LBB5_5: ; GFX9-NEXT: s_endpgm ; @@ -2599,9 +2697,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB5_4 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB5_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -2654,9 +2753,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB5_4 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB5_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 @@ -2701,12 +2801,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: 
v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB5_4 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB5_4 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 @@ -2751,11 +2852,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB5_4 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB5_4 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 @@ -2810,9 +2912,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB5_1 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -2887,8 +2990,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 
exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB5_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -2905,9 +3009,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB5_2 ; GFX9-DPP-NEXT: .LBB5_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -2969,18 +3074,21 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 32 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 0 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_max_f32_e64 v3, s2, s2 ; GFX1064-DPP-NEXT: v_max_f32_e64 v4, s3, s3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: 
s_cbranch_execz .LBB5_2 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -3034,9 +3142,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 @@ -3044,13 +3152,16 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB5_2 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: 
s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -3096,12 +3207,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf @@ -3115,21 +3226,24 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: 
s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_2 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -3173,10 +3287,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: v_dual_min_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0x7fc00000 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 
row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf @@ -3186,17 +3300,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -3223,8 +3340,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: 
v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB6_3 +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s33, s8 ; GFX7LESS-NEXT: s_mov_b32 s40, s7 @@ -3277,13 +3395,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s43 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_2 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX7LESS-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX7LESS-NEXT: .LBB6_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -3297,25 +3416,26 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX9-NEXT: s_add_u32 s48, s48, s9 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB6_3 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX9-NEXT: v_lshlrev_b32_e32 
v3, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX9-NEXT: s_mov_b32 s33, s8 -; GFX9-NEXT: s_mov_b32 s40, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX9-NEXT: s_mov_b32 s40, s7 ; GFX9-NEXT: s_mov_b32 s41, s6 ; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-NEXT: .LBB6_2: ; %atomicrmw.start @@ -3356,8 +3476,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-NEXT: s_cbranch_execnz .LBB6_2 +; GFX9-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX9-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX9-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX9-NEXT: .LBB6_3: ; GFX9-NEXT: s_endpgm ; @@ -3371,13 +3492,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX1064-NEXT: s_add_u32 s48, s48, s9 ; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB6_3 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1064-NEXT: s_mov_b32 s33, s8 @@ -3431,8 +3553,9 @@ define amdgpu_kernel void 
@global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1064-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1064-NEXT: .LBB6_3: ; GFX1064-NEXT: s_endpgm ; @@ -3446,13 +3569,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_add_u32 s48, s48, s9 ; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1032-NEXT: s_mov_b32 s44, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB6_3 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1032-NEXT: s_mov_b32 s33, s8 @@ -3505,8 +3629,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1032-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1032-NEXT: .LBB6_3: ; GFX1032-NEXT: s_endpgm ; @@ -3514,15 +3639,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, 
exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1164-NEXT: s_mov_b32 s32, 32 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB6_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1164-NEXT: s_mov_b32 s33, s8 ; GFX1164-NEXT: s_mov_b32 s40, s7 ; GFX1164-NEXT: s_mov_b32 s41, s6 @@ -3572,8 +3698,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1164-NEXT: .LBB6_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm @@ -3582,14 +3710,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1132-NEXT: s_mov_b32 s44, 0 ; GFX1132-NEXT: s_mov_b32 s32, 32 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB6_3 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; 
GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1132-NEXT: s_mov_b32 s33, s15 ; GFX1132-NEXT: s_mov_b32 s40, s14 ; GFX1132-NEXT: s_mov_b32 s41, s13 @@ -3634,8 +3764,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1132-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1132-NEXT: .LBB6_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm @@ -3653,8 +3785,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX7LESS-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-DPP-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 ; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 @@ -3707,13 +3840,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s43 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-DPP-NEXT: 
buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX7LESS-DPP-NEXT: .LBB6_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -3727,25 +3861,26 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX9-DPP-NEXT: s_mov_b32 s33, s8 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s40, s7 ; GFX9-DPP-NEXT: s_mov_b32 s41, s6 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-DPP-NEXT: .LBB6_2: ; %atomicrmw.start @@ -3786,8 +3921,9 @@ define amdgpu_kernel void 
@global_atomic_fmin_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX9-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX9-DPP-NEXT: .LBB6_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -3801,13 +3937,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 @@ -3861,8 +3998,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1064-DPP-NEXT: .LBB6_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -3876,13 +4014,14 @@ define amdgpu_kernel void 
@global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 @@ -3935,8 +4074,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1032-DPP-NEXT: .LBB6_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -3944,15 +4084,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1164-DPP-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 ; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 @@ -4002,8 +4143,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1164-DPP-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1164-DPP-NEXT: .LBB6_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm @@ -4012,14 +4155,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; 
GFX1132-DPP-NEXT: s_mov_b32 s33, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 @@ -4064,8 +4209,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1132-DPP-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1132-DPP-NEXT: .LBB6_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm @@ -4129,9 +4276,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execz .LBB7_5 +; GFX7LESS-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB7_5 ; GFX7LESS-NEXT: ; %bb.3: ; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 @@ -4175,13 +4323,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 
offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_4 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], exec, s[42:43] +; GFX7LESS-NEXT: s_cselect_b64 exec, s[0:1], s[42:43] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB7_4 ; GFX7LESS-NEXT: .LBB7_5: ; GFX7LESS-NEXT: s_endpgm ; @@ -4238,9 +4387,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB7_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB7_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -4286,8 +4436,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-NEXT: s_cbranch_execnz .LBB7_4 +; GFX9-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX9-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX9-NEXT: s_cbranch_scc1 .LBB7_4 ; GFX9-NEXT: .LBB7_5: ; GFX9-NEXT: s_endpgm ; @@ -4344,9 +4495,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB7_5 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; 
GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB7_5 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -4393,8 +4545,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB7_4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-NEXT: s_cbranch_scc1 .LBB7_4 ; GFX1064-NEXT: .LBB7_5: ; GFX1064-NEXT: s_endpgm ; @@ -4432,50 +4585,51 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0x7ff80000 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1032-NEXT: v_max_f64 v[2:3], v[3:4], v[3:4] ; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1032-NEXT: v_min_f64 v[3:4], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s44, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: 
s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB7_5 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB7_5 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] +; GFX1032-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[42:43] +; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] ; GFX1032-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX1032-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1032-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 @@ -4486,21 +4640,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b32 s13, s40 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_min_f64 v[0:1], 
v[0:1], v[41:42] -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v4, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB7_4 +; GFX1032-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-NEXT: s_cbranch_scc1 .LBB7_4 ; GFX1032-NEXT: .LBB7_5: ; GFX1032-NEXT: s_endpgm ; @@ -4548,12 +4703,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB7_5 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB7_5 ; 
GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 @@ -4598,8 +4754,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-NEXT: s_cbranch_execnz .LBB7_4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-NEXT: s_cbranch_scc1 .LBB7_4 ; GFX1164-NEXT: .LBB7_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm @@ -4649,11 +4807,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s44, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB7_5 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB7_5 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 @@ -4697,8 +4856,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-NEXT: s_cbranch_execnz .LBB7_4 +; GFX1132-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; 
GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-NEXT: s_cbranch_scc1 .LBB7_4 ; GFX1132-NEXT: .LBB7_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm @@ -4779,12 +4940,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-DPP-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[42:43] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[42:43] +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[42:43] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -4882,8 +5044,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_readlane_b32 s42, v8, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -4929,8 +5092,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; 
GFX9-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[46:47] +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[46:47] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX9-DPP-NEXT: .LBB7_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -5021,8 +5185,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -5069,8 +5234,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX1064-DPP-NEXT: .LBB7_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -5154,8 +5320,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -5201,8 +5368,9 @@ define 
amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX1032-DPP-NEXT: .LBB7_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -5291,13 +5459,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -5342,8 +5511,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; 
GFX1164-DPP-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX1164-DPP-NEXT: .LBB7_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm @@ -5417,17 +5588,19 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -5468,8 +5641,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1132-DPP-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX1132-DPP-NEXT: .LBB7_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm @@ -5484,8 +5659,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB8_3 +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -5509,10 +5685,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v4 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v5 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB8_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX7LESS-NEXT: .LBB8_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -5521,8 +5698,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: 
s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB8_3 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 @@ -5541,9 +5719,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB8_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX9-NEXT: .LBB8_3: ; GFX9-NEXT: s_endpgm ; @@ -5552,8 +5731,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB8_2 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -5568,8 +5748,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB8_2 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 @@ -5583,11 +5764,12 @@ define amdgpu_kernel void 
@global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1164-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB8_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 @@ -5608,9 +5790,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1164-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX1164-NEXT: .LBB8_3: ; GFX1164-NEXT: s_endpgm ; @@ -5618,10 +5801,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB8_3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 
s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v4, 0 @@ -5639,9 +5823,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX1132-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s2, exec_lo, s4 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX1132-NEXT: .LBB8_3: ; GFX1132-NEXT: s_endpgm ; @@ -5650,8 +5835,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX7LESS-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-DPP-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -5675,10 +5861,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX7LESS-DPP-NEXT: 
.LBB8_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -5687,8 +5874,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 @@ -5707,9 +5895,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX9-DPP-NEXT: .LBB8_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -5718,8 +5907,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_2 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -5734,8 +5924,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_2 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -5749,11 +5940,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -5774,9 +5966,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX1164-DPP-NEXT: .LBB8_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -5784,10 +5977,11 @@ define amdgpu_kernel void 
@global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -5805,9 +5999,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s2, exec_lo, s4 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX1132-DPP-NEXT: .LBB8_3: ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fmin ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") monotonic, !amdgpu.no.fine.grained.memory !1 @@ -5864,9 +6059,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execz .LBB9_5 +; GFX7LESS-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS-NEXT: s_and_b64 
s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB9_5 ; GFX7LESS-NEXT: ; %bb.3: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 @@ -5889,10 +6085,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_4 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB9_4 ; GFX7LESS-NEXT: .LBB9_5: ; GFX7LESS-NEXT: s_endpgm ; @@ -5943,9 +6140,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB9_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB9_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 @@ -5963,9 +6161,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB9_4 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB9_4 ; GFX9-NEXT: .LBB9_5: ; GFX9-NEXT: s_endpgm ; @@ -6016,9 +6215,10 @@ 
define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB9_4 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB9_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -6073,9 +6273,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB9_4 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB9_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 @@ -6122,12 +6323,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB9_5 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, 
vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB9_5 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 @@ -6147,9 +6349,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1164-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB9_4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_4 ; GFX1164-NEXT: .LBB9_5: ; GFX1164-NEXT: s_endpgm ; @@ -6192,11 +6395,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB9_5 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB9_5 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 @@ -6215,9 +6419,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB9_4 +; 
GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_4 ; GFX1132-NEXT: .LBB9_5: ; GFX1132-NEXT: s_endpgm ; @@ -6269,10 +6474,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_1 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -6363,11 +6569,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 ; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 @@ -6384,9 +6591,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 ; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX9-DPP-NEXT: 
v_mov_b32_e32 v11, v1 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX9-DPP-NEXT: .LBB9_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -6467,12 +6675,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_2 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -6550,12 +6759,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_2 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: 
s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -6640,16 +6850,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 @@ -6669,9 +6879,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; 
GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX1164-DPP-NEXT: .LBB9_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -6738,17 +6949,19 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 @@ -6766,9 +6979,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX1132-DPP-NEXT: .LBB9_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() @@ -6790,8 +7004,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB10_3 +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s33, s8 ; GFX7LESS-NEXT: s_mov_b32 s40, s7 @@ -6844,13 +7059,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s43 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_2 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX7LESS-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX7LESS-NEXT: .LBB10_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -6864,25 +7080,26 @@ define amdgpu_kernel void 
@global_atomic_fmin_double_uni_address_uni_value_defau ; GFX9-NEXT: s_add_u32 s48, s48, s9 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB10_3 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX9-NEXT: s_mov_b32 s33, s8 -; GFX9-NEXT: s_mov_b32 s40, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX9-NEXT: s_mov_b32 s40, s7 ; GFX9-NEXT: s_mov_b32 s41, s6 ; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-NEXT: .LBB10_2: ; %atomicrmw.start @@ -6923,8 +7140,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-NEXT: s_cbranch_execnz .LBB10_2 +; GFX9-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX9-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX9-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX9-NEXT: .LBB10_3: ; GFX9-NEXT: s_endpgm ; @@ -6938,13 +7156,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX1064-NEXT: s_add_u32 s48, s48, s9 ; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], 
s[0:1] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB10_3 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1064-NEXT: s_mov_b32 s33, s8 @@ -6998,8 +7217,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1064-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX1064-NEXT: .LBB10_3: ; GFX1064-NEXT: s_endpgm ; @@ -7013,13 +7233,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_add_u32 s48, s48, s9 ; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1032-NEXT: s_mov_b32 s44, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB10_3 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1032-NEXT: s_mov_b32 s33, s8 @@ -7072,8 +7293,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1032-NEXT: 
v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1032-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX1032-NEXT: .LBB10_3: ; GFX1032-NEXT: s_endpgm ; @@ -7081,15 +7303,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1164-NEXT: s_mov_b32 s32, 32 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB10_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1164-NEXT: s_mov_b32 s33, s8 ; GFX1164-NEXT: s_mov_b32 s40, s7 ; GFX1164-NEXT: s_mov_b32 s41, s6 @@ -7139,8 +7362,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX1164-NEXT: .LBB10_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; 
GFX1164-NEXT: s_endpgm @@ -7149,14 +7374,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1132-NEXT: s_mov_b32 s44, 0 ; GFX1132-NEXT: s_mov_b32 s32, 32 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB10_3 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1132-NEXT: s_mov_b32 s33, s15 ; GFX1132-NEXT: s_mov_b32 s40, s14 ; GFX1132-NEXT: s_mov_b32 s41, s13 @@ -7201,8 +7428,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1132-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX1132-NEXT: .LBB10_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm @@ -7220,8 +7449,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX7LESS-DPP-NEXT: s_and_b64 s[0:1], 
vcc, -1 +; GFX7LESS-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-DPP-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 ; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 @@ -7274,13 +7504,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s43 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX7LESS-DPP-NEXT: .LBB10_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -7294,25 +7525,26 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX9-DPP-NEXT: s_mov_b32 s33, s8 -; GFX9-DPP-NEXT: 
s_mov_b32 s40, s7 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s40, s7 ; GFX9-DPP-NEXT: s_mov_b32 s41, s6 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start @@ -7353,8 +7585,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX9-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX9-DPP-NEXT: .LBB10_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -7368,13 +7601,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 @@ -7428,8 +7662,9 @@ 
define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX1064-DPP-NEXT: .LBB10_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -7443,13 +7678,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 @@ -7502,8 +7738,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX1032-DPP-NEXT: .LBB10_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -7511,15 +7748,16 @@ 
define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 ; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 @@ -7569,8 +7807,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1164-DPP-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX1164-DPP-NEXT: .LBB10_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm @@ -7579,14 +7819,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; 
GFX1132-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 @@ -7631,8 +7873,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1132-DPP-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX1132-DPP-NEXT: .LBB10_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm @@ -7696,9 +7940,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execz .LBB11_5 +; GFX7LESS-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB11_5 ; 
GFX7LESS-NEXT: ; %bb.3: ; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 @@ -7742,13 +7987,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_4 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], exec, s[42:43] +; GFX7LESS-NEXT: s_cselect_b64 exec, s[0:1], s[42:43] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB11_4 ; GFX7LESS-NEXT: .LBB11_5: ; GFX7LESS-NEXT: s_endpgm ; @@ -7805,9 +8051,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB11_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB11_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -7853,8 +8100,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-NEXT: s_cbranch_execnz .LBB11_4 +; GFX9-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX9-NEXT: 
s_cselect_b64 exec, s[0:1], s[44:45] +; GFX9-NEXT: s_cbranch_scc1 .LBB11_4 ; GFX9-NEXT: .LBB11_5: ; GFX9-NEXT: s_endpgm ; @@ -7911,9 +8159,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB11_5 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB11_5 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -7960,8 +8209,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB11_4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-NEXT: s_cbranch_scc1 .LBB11_4 ; GFX1064-NEXT: .LBB11_5: ; GFX1064-NEXT: s_endpgm ; @@ -7999,50 +8249,51 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0x7ff80000 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1032-NEXT: v_max_f64 v[2:3], v[3:4], v[3:4] ; GFX1032-NEXT: 
v_readlane_b32 s3, v1, s1 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1032-NEXT: v_min_f64 v[3:4], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s44, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB11_5 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB11_5 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] +; GFX1032-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[42:43] +; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] ; GFX1032-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX1032-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1032-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1032-NEXT: s_load_dwordx2 s[6:7], 
s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 @@ -8053,21 +8304,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-NEXT: s_mov_b32 s13, s40 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v4, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB11_4 +; GFX1032-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-NEXT: s_cbranch_scc1 .LBB11_4 ; GFX1032-NEXT: .LBB11_5: ; GFX1032-NEXT: s_endpgm ; @@ -8115,12 +8367,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1 ; 
GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB11_5 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB11_5 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 @@ -8165,8 +8418,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-NEXT: s_cbranch_execnz .LBB11_4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-NEXT: s_cbranch_scc1 .LBB11_4 ; GFX1164-NEXT: .LBB11_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm @@ -8216,11 +8471,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s44, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB11_5 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: 
s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB11_5 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 @@ -8264,8 +8520,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-NEXT: s_cbranch_execnz .LBB11_4 +; GFX1132-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-NEXT: s_cbranch_scc1 .LBB11_4 ; GFX1132-NEXT: .LBB11_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm @@ -8346,12 +8604,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-DPP-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[42:43] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB11_1 +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[42:43] +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[42:43] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -8449,8 +8708,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: v_readlane_b32 s42, v8, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], 
vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -8496,8 +8756,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX9-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[46:47] +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[46:47] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX9-DPP-NEXT: .LBB11_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -8588,8 +8849,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -8636,8 +8898,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX1064-DPP-NEXT: .LBB11_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ 
-8721,8 +8984,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -8768,8 +9032,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX1032-DPP-NEXT: .LBB11_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -8858,13 +9123,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 
+; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -8909,8 +9175,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1164-DPP-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX1164-DPP-NEXT: .LBB11_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm @@ -8984,17 +9252,19 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; 
GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -9035,8 +9305,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1132-DPP-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX1132-DPP-NEXT: .LBB11_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm @@ -9051,8 +9323,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB12_3 +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB12_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -9073,9 +9346,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; 
GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB12_2 ; GFX7LESS-NEXT: .LBB12_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -9084,8 +9358,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB12_3 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB12_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 @@ -9102,9 +9377,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB12_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB12_2 ; GFX9-NEXT: .LBB12_3: ; GFX9-NEXT: s_endpgm ; @@ -9113,8 +9389,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB12_2 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB12_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -9128,8 +9405,9 @@ define amdgpu_kernel void 
@global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB12_2 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB12_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 @@ -9142,11 +9420,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB12_2 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB12_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 @@ -9161,10 +9440,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB12_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; 
GFX1132-NEXT: s_cbranch_scc0 .LBB12_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 @@ -9180,8 +9460,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX7LESS-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-DPP-NEXT: s_cbranch_scc0 .LBB12_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -9202,9 +9483,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB12_2 ; GFX7LESS-DPP-NEXT: .LBB12_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -9213,8 +9495,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB12_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 @@ 
-9231,9 +9514,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB12_2 ; GFX9-DPP-NEXT: .LBB12_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -9242,8 +9526,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_2 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB12_2 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -9257,8 +9542,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_2 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB12_2 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -9271,11 +9557,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1164-DPP-LABEL: 
global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_2 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB12_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -9290,10 +9577,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_2 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB12_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 @@ -9313,8 +9601,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: 
s_cbranch_execz .LBB13_3 +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -9335,9 +9624,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB13_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX7LESS-NEXT: .LBB13_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -9346,8 +9636,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB13_3 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 @@ -9364,9 +9655,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB13_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX9-NEXT: .LBB13_3: ; GFX9-NEXT: s_endpgm ; @@ -9375,8 +9667,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; 
GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB13_2 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB13_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -9390,8 +9683,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB13_2 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB13_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 @@ -9404,11 +9698,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB13_2 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB13_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 @@ -9423,10 +9718,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-LABEL: 
global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB13_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB13_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 @@ -9442,8 +9738,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX7LESS-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-DPP-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -9464,9 +9761,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX7LESS-DPP-NEXT: .LBB13_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -9475,8 +9773,9 @@ define amdgpu_kernel void 
@global_atomic_fmin_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 @@ -9493,9 +9792,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX9-DPP-NEXT: .LBB13_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -9504,8 +9804,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_2 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB13_2 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -9519,8 +9820,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; 
GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_2 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB13_2 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -9533,11 +9835,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB13_2 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB13_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -9552,10 +9855,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB13_2 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB13_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-DPP-NEXT: 
v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll index 2165a6ff65e3b5..8f1d4b4dfd3e51 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -22,8 +22,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB0_3 +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[4:5] @@ -46,9 +47,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB0_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX7LESS-NEXT: .LBB0_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -58,8 +60,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB0_3 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ 
-78,9 +81,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB0_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX9-NEXT: .LBB0_3: ; GFX9-NEXT: s_endpgm ; @@ -90,8 +94,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB0_3 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[4:5] @@ -111,8 +116,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX1064-NEXT: .LBB0_3: ; GFX1064-NEXT: s_endpgm ; @@ -122,8 +128,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: s_mov_b32 s4, 0 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB0_3 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: 
s_cbranch_scc0 .LBB0_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s5 @@ -142,20 +149,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1032-NEXT: s_andn2_b32 s2, exec_lo, s4 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1032-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX1032-NEXT: .LBB0_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB0_3 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: s_bcnt1_i32_b64 s3, s[4:5] @@ -177,9 +187,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 
s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX1164-NEXT: .LBB0_3: ; GFX1164-NEXT: s_endpgm ; @@ -188,10 +199,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: s_mov_b32 s5, exec_lo ; GFX1132-NEXT: s_mov_b32 s4, 0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB0_3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s5 @@ -211,9 +223,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX1132-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s2, exec_lo, s4 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1132-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX1132-NEXT: .LBB0_3: ; GFX1132-NEXT: s_endpgm ; @@ -223,8 +236,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX7LESS-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-DPP-NEXT: s_cbranch_scc0 .LBB0_3 ; 
GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[4:5] @@ -247,9 +261,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX7LESS-DPP-NEXT: .LBB0_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -259,8 +274,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -279,9 +295,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX9-DPP-NEXT: .LBB0_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -291,8 +308,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1064-DPP-NEXT: 
v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s3, s[4:5] @@ -312,8 +330,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX1064-DPP-NEXT: .LBB0_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -323,8 +342,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s5 @@ -343,20 +363,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1032-DPP-NEXT: 
s_andn2_b32 s2, exec_lo, s4 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX1032-DPP-NEXT: .LBB0_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s3, s[4:5] @@ -378,9 +401,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX1164-DPP-NEXT: .LBB0_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -389,10 +413,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: s_mov_b32 s5, exec_lo ; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s5 @@ -412,9 +437,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s2, exec_lo, s4 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX1132-DPP-NEXT: .LBB0_3: ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4 @@ -467,9 +493,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execz .LBB1_5 +; GFX7LESS-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB1_5 ; GFX7LESS-NEXT: ; %bb.3: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 ; 
GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 @@ -488,9 +515,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_4 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB1_4 ; GFX7LESS-NEXT: .LBB1_5: ; GFX7LESS-NEXT: s_endpgm ; @@ -537,9 +565,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB1_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB1_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 @@ -554,9 +583,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB1_4 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB1_4 ; GFX9-NEXT: .LBB1_5: ; GFX9-NEXT: s_endpgm ; @@ -603,9 +633,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: 
s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB1_5 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB1_5 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 @@ -621,8 +652,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB1_4 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB1_4 ; GFX1064-NEXT: .LBB1_5: ; GFX1064-NEXT: s_endpgm ; @@ -669,9 +701,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB1_5 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB1_5 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 @@ -686,8 +719,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB1_4 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 
.LBB1_4 ; GFX1032-NEXT: .LBB1_5: ; GFX1032-NEXT: s_endpgm ; @@ -725,12 +759,13 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB1_5 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB1_5 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 @@ -746,9 +781,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB1_4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB1_4 ; GFX1164-NEXT: .LBB1_5: ; GFX1164-NEXT: s_endpgm ; @@ -787,11 +823,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: 
s_cbranch_execz .LBB1_5 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB1_5 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 @@ -806,9 +843,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB1_4 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB1_4 ; GFX1132-NEXT: .LBB1_5: ; GFX1132-NEXT: s_endpgm ; @@ -856,9 +894,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB1_1 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -926,8 +965,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; 
GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB1_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -942,9 +982,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX9-DPP-NEXT: .LBB1_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -998,18 +1039,21 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_add_f32_e64 v3, s2, s3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB1_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: 
s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -1025,8 +1069,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX1064-DPP-NEXT: .LBB1_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -1079,14 +1124,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB1_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -1101,8 +1149,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; 
GFX1032-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX1032-DPP-NEXT: .LBB1_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -1152,21 +1201,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB1_3 ; 
GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -1182,9 +1234,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX1164-DPP-NEXT: .LBB1_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -1232,16 +1285,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 
0, v4 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB1_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -1256,9 +1312,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX1132-DPP-NEXT: .LBB1_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call float @div.float.value() @@ -1279,8 +1336,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB2_3 +; GFX7LESS-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[0:1] ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 @@ -1307,9 +1365,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, 
exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB2_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX7LESS-NEXT: .LBB2_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -1325,8 +1384,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB2_3 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1349,9 +1409,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB2_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX9-NEXT: .LBB2_3: ; GFX9-NEXT: s_endpgm ; @@ -1367,8 +1428,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1064-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB2_3 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 @@ -1390,8 +1452,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, 
s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1064-NEXT: .LBB2_3: ; GFX1064-NEXT: s_endpgm ; @@ -1407,8 +1470,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB2_3 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s0 ; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 @@ -1429,8 +1493,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1032-NEXT: s_andn2_b32 s2, exec_lo, s4 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1032-NEXT: .LBB2_3: ; GFX1032-NEXT: s_endpgm ; @@ -1440,15 +1505,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-NEXT: v_mov_b32_e32 v1, s0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off ; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB2_3 +; GFX1164-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -1471,9 +1537,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1164-NEXT: .LBB2_3: ; GFX1164-NEXT: s_endpgm ; @@ -1484,13 +1551,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off ; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB2_3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -1511,9 +1579,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 -; 
GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX1132-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s2, exec_lo, s4 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1132-NEXT: .LBB2_3: ; GFX1132-NEXT: s_endpgm ; @@ -1529,8 +1598,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX7LESS-DPP-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX7LESS-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-DPP-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[0:1] ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 @@ -1557,9 +1627,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX7LESS-DPP-NEXT: .LBB2_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -1575,8 +1646,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3 +; 
GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1599,9 +1671,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX9-DPP-NEXT: .LBB2_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -1617,8 +1690,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 @@ -1640,8 +1714,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1064-DPP-NEXT: .LBB2_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -1657,8 +1732,9 @@ define amdgpu_kernel void 
@global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s0 ; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 @@ -1679,8 +1755,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s2, exec_lo, s4 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1032-DPP-NEXT: .LBB2_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -1690,15 +1767,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off ; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: 
s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -1721,9 +1799,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1164-DPP-NEXT: .LBB2_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -1734,13 +1813,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off ; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -1761,9 +1841,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s2, exec_lo, s4 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1132-DPP-NEXT: .LBB2_3: ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 syncscope("one-as") monotonic @@ -1817,9 +1898,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execz .LBB3_5 +; GFX7LESS-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB3_5 ; GFX7LESS-NEXT: ; %bb.3: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 @@ -1838,9 +1920,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_4 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB3_4 ; GFX7LESS-NEXT: .LBB3_5: ; GFX7LESS-NEXT: s_endpgm ; @@ -1887,9 +1970,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: 
s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB3_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB3_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 @@ -1904,9 +1988,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB3_4 ; GFX9-NEXT: .LBB3_5: ; GFX9-NEXT: s_endpgm ; @@ -1953,9 +2038,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB3_5 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB3_5 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 @@ -1971,8 +2057,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 
.LBB3_4 ; GFX1064-NEXT: .LBB3_5: ; GFX1064-NEXT: s_endpgm ; @@ -2019,9 +2106,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB3_5 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB3_5 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 @@ -2036,8 +2124,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB3_4 ; GFX1032-NEXT: .LBB3_5: ; GFX1032-NEXT: s_endpgm ; @@ -2075,12 +2164,13 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB3_5 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB3_5 ; GFX1164-NEXT: ; %bb.3: ; 
GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 @@ -2096,9 +2186,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB3_4 ; GFX1164-NEXT: .LBB3_5: ; GFX1164-NEXT: s_endpgm ; @@ -2137,11 +2228,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB3_5 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB3_5 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 @@ -2156,9 +2248,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: 
s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB3_4 ; GFX1132-NEXT: .LBB3_5: ; GFX1132-NEXT: s_endpgm ; @@ -2206,9 +2299,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB3_1 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -2276,8 +2370,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -2292,9 +2387,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB3_2 ; GFX9-DPP-NEXT: .LBB3_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -2348,18 +2444,21 @@ define amdgpu_kernel void 
@global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_add_f32_e64 v3, s2, s3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -2375,8 +2474,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB3_2 ; GFX1064-DPP-NEXT: .LBB3_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -2429,14 +2529,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX1032-DPP-NEXT: 
v_mov_b32_e32 v4, v3 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -2451,8 +2554,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB3_2 ; GFX1032-DPP-NEXT: .LBB3_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -2502,21 +2606,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -2532,9 +2639,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB3_2 ; GFX1164-DPP-NEXT: .LBB3_3: ; 
GFX1164-DPP-NEXT: s_endpgm ; @@ -2582,16 +2690,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -2606,9 +2717,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 
+; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB3_2 ; GFX1132-DPP-NEXT: .LBB3_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call float @div.float.value() strictfp @@ -2629,8 +2741,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB4_3 +; GFX7LESS-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[0:1] ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 @@ -2657,9 +2770,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX7LESS-NEXT: .LBB4_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -2675,8 +2789,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB4_3 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -2699,9 +2814,10 @@ define amdgpu_kernel void 
@global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB4_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX9-NEXT: .LBB4_3: ; GFX9-NEXT: s_endpgm ; @@ -2717,8 +2833,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1064-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB4_3 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 @@ -2740,8 +2857,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1064-NEXT: .LBB4_3: ; GFX1064-NEXT: s_endpgm ; @@ -2757,8 +2875,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB4_3 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1032-NEXT: ; %bb.1: ; 
GFX1032-NEXT: s_bcnt1_i32_b32 s0, s0 ; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 @@ -2779,8 +2898,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1032-NEXT: s_andn2_b32 s2, exec_lo, s4 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1032-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1032-NEXT: .LBB4_3: ; GFX1032-NEXT: s_endpgm ; @@ -2790,15 +2910,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-NEXT: v_mov_b32_e32 v1, s0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off ; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB4_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -2821,9 +2942,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 
s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1164-NEXT: .LBB4_3: ; GFX1164-NEXT: s_endpgm ; @@ -2834,13 +2956,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off ; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB4_3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -2861,9 +2984,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX1132-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s2, exec_lo, s4 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1132-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1132-NEXT: .LBB4_3: ; GFX1132-NEXT: s_endpgm ; @@ -2879,8 +3003,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; 
GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX7LESS-DPP-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX7LESS-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-DPP-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[0:1] ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 @@ -2907,9 +3032,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX7LESS-DPP-NEXT: .LBB4_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -2925,8 +3051,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -2949,9 +3076,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX9-DPP-NEXT: 
.LBB4_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -2967,8 +3095,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 @@ -2990,8 +3119,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1064-DPP-NEXT: .LBB4_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -3007,8 +3137,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s0 ; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 @@ -3029,8 +3160,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, 
s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s2, exec_lo, s4 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1032-DPP-NEXT: .LBB4_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -3040,15 +3172,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off ; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -3071,9 +3204,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1164-DPP-NEXT: .LBB4_3: ; GFX1164-DPP-NEXT: 
s_endpgm ; @@ -3084,13 +3218,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off ; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -3111,9 +3246,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s2, exec_lo, s4 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1132-DPP-NEXT: .LBB4_3: ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic @@ -3167,9 +3303,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: 
s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execz .LBB5_5 +; GFX7LESS-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB5_5 ; GFX7LESS-NEXT: ; %bb.3: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 @@ -3188,9 +3325,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_4 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX7LESS-NEXT: .LBB5_5: ; GFX7LESS-NEXT: s_endpgm ; @@ -3237,9 +3375,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB5_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB5_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 @@ -3254,9 +3393,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB5_4 +; GFX9-NEXT: 
s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX9-NEXT: .LBB5_5: ; GFX9-NEXT: s_endpgm ; @@ -3303,9 +3443,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB5_5 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB5_5 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 @@ -3321,8 +3462,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1064-NEXT: .LBB5_5: ; GFX1064-NEXT: s_endpgm ; @@ -3369,9 +3511,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB5_5 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB5_5 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 @@ -3386,8 +3529,9 @@ define amdgpu_kernel void 
@global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1032-NEXT: .LBB5_5: ; GFX1032-NEXT: s_endpgm ; @@ -3425,12 +3569,13 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB5_5 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB5_5 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 @@ -3446,9 +3591,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1164-NEXT: .LBB5_5: ; GFX1164-NEXT: s_endpgm ; @@ -3487,11 +3633,12 
@@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB5_5 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB5_5 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 @@ -3506,9 +3653,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1132-NEXT: .LBB5_5: ; GFX1132-NEXT: s_endpgm ; @@ -3556,9 +3704,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB5_1 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX7LESS-DPP-NEXT: s_endpgm ; @@ -3626,8 +3775,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB5_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -3642,9 +3792,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB5_2 ; GFX9-DPP-NEXT: .LBB5_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -3698,18 +3849,21 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_add_f32_e64 v3, s2, s3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; 
GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB5_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -3725,8 +3879,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB5_2 ; GFX1064-DPP-NEXT: .LBB5_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -3779,14 +3934,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB5_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; 
GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -3801,8 +3959,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB5_2 ; GFX1032-DPP-NEXT: .LBB5_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -3852,21 +4011,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB5_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -3882,9 +4044,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB5_2 ; GFX1164-DPP-NEXT: .LBB5_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -3932,16 +4095,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; 
GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB5_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -3956,9 +4122,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB5_2 ; GFX1132-DPP-NEXT: .LBB5_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call float @div.float.value() @@ -4013,9 +4180,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execz .LBB6_5 +; GFX7LESS-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: 
s_cbranch_scc0 .LBB6_5 ; GFX7LESS-NEXT: ; %bb.3: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 @@ -4034,9 +4202,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_4 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB6_4 ; GFX7LESS-NEXT: .LBB6_5: ; GFX7LESS-NEXT: s_endpgm ; @@ -4083,9 +4252,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB6_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB6_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 @@ -4100,9 +4270,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB6_4 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB6_4 ; GFX9-NEXT: .LBB6_5: ; GFX9-NEXT: s_endpgm ; @@ -4149,9 +4320,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; 
GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB6_5 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB6_5 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 @@ -4167,8 +4339,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB6_4 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB6_4 ; GFX1064-NEXT: .LBB6_5: ; GFX1064-NEXT: s_endpgm ; @@ -4215,9 +4388,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB6_5 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB6_5 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 @@ -4232,8 +4406,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB6_4 +; 
GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB6_4 ; GFX1032-NEXT: .LBB6_5: ; GFX1032-NEXT: s_endpgm ; @@ -4271,12 +4446,13 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB6_5 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB6_5 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 @@ -4292,9 +4468,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB6_4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB6_4 ; GFX1164-NEXT: .LBB6_5: ; GFX1164-NEXT: s_endpgm ; @@ -4333,11 +4510,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB6_5 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB6_5 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 @@ -4352,9 +4530,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB6_4 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB6_4 ; GFX1132-NEXT: .LBB6_5: ; GFX1132-NEXT: s_endpgm ; @@ -4402,9 +4581,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB6_1 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -4472,8 +4652,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-DPP-NEXT: 
s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -4488,9 +4669,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX9-DPP-NEXT: .LBB6_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -4544,18 +4726,21 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_add_f32_e64 v3, s2, s3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: 
s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -4571,8 +4756,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1064-DPP-NEXT: .LBB6_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -4625,14 +4811,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -4647,8 +4836,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 
v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1032-DPP-NEXT: .LBB6_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -4698,21 +4888,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 
+; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -4728,9 +4921,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1164-DPP-NEXT: .LBB6_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -4778,16 +4972,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 
s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -4802,9 +4999,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1132-DPP-NEXT: .LBB6_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call float @div.float.value() strictfp @@ -4825,8 +5023,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB7_3 +; GFX7LESS-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[0:1] ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 @@ -4853,9 +5052,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 
s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX7LESS-NEXT: .LBB7_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -4871,8 +5071,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB7_3 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -4895,9 +5096,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB7_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX9-NEXT: .LBB7_3: ; GFX9-NEXT: s_endpgm ; @@ -4913,8 +5115,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1064-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB7_3 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 @@ -4936,8 +5139,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1064-NEXT: 
v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX1064-NEXT: .LBB7_3: ; GFX1064-NEXT: s_endpgm ; @@ -4953,8 +5157,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB7_3 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s0 ; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 @@ -4975,8 +5180,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1032-NEXT: s_andn2_b32 s2, exec_lo, s4 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1032-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX1032-NEXT: .LBB7_3: ; GFX1032-NEXT: s_endpgm ; @@ -4986,15 +5192,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-NEXT: v_mov_b32_e32 v1, s0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off ; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; 
GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB7_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -5017,9 +5224,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX1164-NEXT: .LBB7_3: ; GFX1164-NEXT: s_endpgm ; @@ -5030,13 +5238,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off ; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB7_3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -5057,9 +5266,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1132-NEXT: v_cmp_eq_u32_e32 
vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX1132-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s2, exec_lo, s4 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1132-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX1132-NEXT: .LBB7_3: ; GFX1132-NEXT: s_endpgm ; @@ -5075,8 +5285,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX7LESS-DPP-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX7LESS-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-DPP-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[0:1] ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 @@ -5103,9 +5314,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX7LESS-DPP-NEXT: .LBB7_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -5121,8 +5333,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; 
GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -5145,9 +5358,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX9-DPP-NEXT: .LBB7_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -5163,8 +5377,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 @@ -5186,8 +5401,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX1064-DPP-NEXT: .LBB7_3: ; 
GFX1064-DPP-NEXT: s_endpgm ; @@ -5203,8 +5419,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s0 ; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 @@ -5225,8 +5442,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s2, exec_lo, s4 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX1032-DPP-NEXT: .LBB7_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -5236,15 +5454,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off ; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: 
s_cbranch_scc0 .LBB7_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -5267,9 +5486,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX1164-DPP-NEXT: .LBB7_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -5280,13 +5500,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off ; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -5307,9 +5528,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-DPP-NEXT: s_or_b32 
s4, vcc_lo, s4 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s2, exec_lo, s4 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX1132-DPP-NEXT: .LBB7_3: ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 monotonic, align 4 @@ -5362,9 +5584,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execz .LBB8_5 +; GFX7LESS-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB8_5 ; GFX7LESS-NEXT: ; %bb.3: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 @@ -5383,9 +5606,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB8_4 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX7LESS-NEXT: .LBB8_5: ; GFX7LESS-NEXT: s_endpgm ; @@ -5432,9 +5656,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB8_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB8_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 @@ -5449,9 +5674,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB8_4 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX9-NEXT: .LBB8_5: ; GFX9-NEXT: s_endpgm ; @@ -5498,9 +5724,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB8_5 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB8_5 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 @@ -5516,8 +5743,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB8_4 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: 
s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1064-NEXT: .LBB8_5: ; GFX1064-NEXT: s_endpgm ; @@ -5564,9 +5792,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB8_5 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB8_5 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 @@ -5581,8 +5810,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB8_4 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1032-NEXT: .LBB8_5: ; GFX1032-NEXT: s_endpgm ; @@ -5620,12 +5850,13 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB8_5 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; 
GFX1164-NEXT: s_cbranch_scc0 .LBB8_5 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 @@ -5641,9 +5872,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB8_4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1164-NEXT: .LBB8_5: ; GFX1164-NEXT: s_endpgm ; @@ -5682,11 +5914,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB8_5 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB8_5 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 @@ -5701,9 +5934,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB8_4 +; GFX1132-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1132-NEXT: .LBB8_5: ; GFX1132-NEXT: s_endpgm ; @@ -5751,9 +5985,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -5821,8 +6056,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -5837,9 +6073,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX9-DPP-NEXT: .LBB8_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ 
-5893,18 +6130,21 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_add_f32_e64 v3, s2, s3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -5920,8 +6160,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX1064-DPP-NEXT: .LBB8_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -5974,14 +6215,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: 
v_add_f32_e32 v3, v3, v5 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -5996,8 +6240,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX1032-DPP-NEXT: .LBB8_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -6047,21 +6292,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -6077,9 +6325,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB8_2 ; 
GFX1164-DPP-NEXT: .LBB8_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -6127,16 +6376,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -6151,9 +6403,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX1132-DPP-NEXT: .LBB8_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call float @div.float.value() strictfp @@ -6177,8 +6430,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v3, s0, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, s1, v3 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB9_3 +; GFX7LESS-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s33, s8 ; GFX7LESS-NEXT: s_mov_b32 s40, s7 @@ -6232,13 +6486,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s43 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_2 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX7LESS-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX7LESS-NEXT: .LBB9_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -6254,14 +6509,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX9-NEXT: s_add_u32 s48, s48, s9 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, s1, v3 ; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] ; 
GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB9_3 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 @@ -6272,9 +6527,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX9-NEXT: s_mov_b32 s40, s7 ; GFX9-NEXT: s_mov_b32 s41, s6 ; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-NEXT: .LBB9_2: ; %atomicrmw.start @@ -6314,8 +6570,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-NEXT: s_cbranch_execnz .LBB9_2 +; GFX9-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX9-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX9-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX9-NEXT: .LBB9_3: ; GFX9-NEXT: s_endpgm ; @@ -6330,13 +6587,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1064-NEXT: s_mov_b64 s[8:9], exec ; GFX1064-NEXT: s_addc_u32 s49, s49, 0 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, s9, v3 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], 
vcc -; GFX1064-NEXT: s_cbranch_execz .LBB9_3 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[8:9] ; GFX1064-NEXT: s_mov_b32 s40, s7 @@ -6392,8 +6650,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1064-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX1064-NEXT: .LBB9_3: ; GFX1064-NEXT: s_endpgm ; @@ -6409,12 +6668,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-NEXT: s_add_u32 s48, s48, s9 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1032-NEXT: s_mov_b32 s44, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB9_3 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s8 ; GFX1032-NEXT: s_mov_b32 s40, s7 @@ -6469,8 +6729,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1032-NEXT: s_andn2_b32 s0, 
exec_lo, s44 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX1032-NEXT: .LBB9_3: ; GFX1032-NEXT: s_endpgm ; @@ -6480,14 +6741,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-NEXT: s_mov_b64 s[8:9], exec ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1164-NEXT: s_mov_b32 s32, 32 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB9_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[8:9] ; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 @@ -6540,8 +6802,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX1164-NEXT: .LBB9_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm @@ -6551,13 +6815,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_mov_b32 s6, exec_lo ; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; 
GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1132-NEXT: s_mov_b32 s44, 0 ; GFX1132-NEXT: s_mov_b32 s32, 32 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB9_3 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1132-NEXT: s_bcnt1_i32_b32 s0, s6 ; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1132-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 @@ -6605,8 +6871,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1132-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX1132-NEXT: .LBB9_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm @@ -6626,8 +6894,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v3, s0, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, s1, v3 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX7LESS-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX7LESS-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-DPP-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 ; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 @@ -6681,13 +6950,14 @@ define amdgpu_kernel void 
@global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s43 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX7LESS-DPP-NEXT: .LBB9_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -6703,14 +6973,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, s1, v3 ; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 @@ -6721,9 +6991,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s40, s7 ; GFX9-DPP-NEXT: s_mov_b32 s41, s6 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], 
s[4:5] -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start @@ -6763,8 +7034,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX9-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX9-DPP-NEXT: .LBB9_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -6779,13 +7051,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: s_mov_b64 s[8:9], exec ; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, s9, v3 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[8:9] ; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 @@ -6841,8 +7114,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; 
GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX1064-DPP-NEXT: .LBB9_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -6858,12 +7132,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s8 ; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 @@ -6918,8 +7193,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX1032-DPP-NEXT: .LBB9_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -6929,14 +7205,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: s_mov_b64 s[8:9], exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 
s32, 32 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[8:9] ; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 @@ -6989,8 +7266,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1164-DPP-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX1164-DPP-NEXT: .LBB9_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm @@ -7000,13 +7279,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_mov_b32 s6, exec_lo ; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, s6 ; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1132-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 @@ -7054,8 +7335,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1132-DPP-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX1132-DPP-NEXT: .LBB9_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm @@ -7117,9 +7400,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execz .LBB10_5 +; GFX7LESS-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB10_5 ; GFX7LESS-NEXT: ; %bb.3: ; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 @@ -7161,13 +7445,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: 
s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_4 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], exec, s[42:43] +; GFX7LESS-NEXT: s_cselect_b64 exec, s[0:1], s[42:43] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB10_4 ; GFX7LESS-NEXT: .LBB10_5: ; GFX7LESS-NEXT: s_endpgm ; @@ -7222,9 +7507,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB10_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB10_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -7268,8 +7554,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-NEXT: s_cbranch_execnz .LBB10_4 +; GFX9-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX9-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX9-NEXT: s_cbranch_scc1 .LBB10_4 ; GFX9-NEXT: .LBB10_5: ; GFX9-NEXT: s_endpgm ; @@ -7324,9 +7611,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, 
exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB10_5 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB10_5 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -7371,8 +7659,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB10_4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-NEXT: s_cbranch_scc1 .LBB10_4 ; GFX1064-NEXT: .LBB10_5: ; GFX1064-NEXT: s_endpgm ; @@ -7427,9 +7716,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s44, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB10_5 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB10_5 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 @@ -7473,8 +7763,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB10_4 +; GFX1032-NEXT: 
s_andn2_b32 s0, exec_lo, s44 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-NEXT: s_cbranch_scc1 .LBB10_4 ; GFX1032-NEXT: .LBB10_5: ; GFX1032-NEXT: s_endpgm ; @@ -7520,12 +7811,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB10_5 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB10_5 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 @@ -7568,8 +7860,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-NEXT: s_cbranch_execnz .LBB10_4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-NEXT: s_cbranch_scc1 .LBB10_4 ; GFX1164-NEXT: .LBB10_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm @@ -7616,11 +7910,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s44, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: 
s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB10_5 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB10_5 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 @@ -7658,8 +7953,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-NEXT: s_cbranch_execnz .LBB10_4 +; GFX1132-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-NEXT: s_cbranch_scc1 .LBB10_4 ; GFX1132-NEXT: .LBB10_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm @@ -7740,13 +8037,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-DPP-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[42:43] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB10_1 +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[42:43] +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[42:43] +; 
GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -7837,8 +8135,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_readlane_b32 s42, v8, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -7882,8 +8181,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX9-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[46:47] +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[46:47] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX9-DPP-NEXT: .LBB10_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -7966,8 +8266,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -8012,8 +8313,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: 
v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX1064-DPP-NEXT: .LBB10_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -8091,8 +8393,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -8136,8 +8439,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX1032-DPP-NEXT: .LBB10_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -8216,13 +8520,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -8265,8 +8570,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1164-DPP-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX1164-DPP-NEXT: .LBB10_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm @@ -8334,15 +8641,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -8380,8 +8688,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1132-DPP-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX1132-DPP-NEXT: .LBB10_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm @@ -8403,8 +8713,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB11_3 +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[4:5] @@ -8433,10 +8744,11 @@ define amdgpu_kernel void 
@global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX7LESS-NEXT: .LBB11_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -8452,8 +8764,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB11_3 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -8477,9 +8790,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB11_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX9-NEXT: .LBB11_3: ; GFX9-NEXT: s_endpgm ; @@ -8495,8 +8809,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1064-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB11_3 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; 
GFX1064-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[0:1] ; GFX1064-NEXT: s_mov_b32 s5, 0x43300000 @@ -8519,8 +8834,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX1064-NEXT: .LBB11_3: ; GFX1064-NEXT: s_endpgm ; @@ -8536,8 +8852,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB11_3 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s6, s0 ; GFX1032-NEXT: s_mov_b32 s7, 0x43300000 @@ -8559,8 +8876,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1032-NEXT: s_andn2_b32 s2, exec_lo, s4 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1032-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX1032-NEXT: .LBB11_3: ; GFX1032-NEXT: s_endpgm ; @@ -8570,15 +8888,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-NEXT: v_mov_b32_e32 v1, s0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_clause 0x1 ; 
GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off ; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB11_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -8602,9 +8921,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1164-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX1164-NEXT: .LBB11_3: ; GFX1164-NEXT: s_endpgm ; @@ -8615,13 +8935,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off ; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB11_3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 
.LBB11_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -8642,9 +8963,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX1132-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s2, exec_lo, s4 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1132-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX1132-NEXT: .LBB11_3: ; GFX1132-NEXT: s_endpgm ; @@ -8660,8 +8982,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX7LESS-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-DPP-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[4:5] @@ -8690,10 +9013,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; 
GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX7LESS-DPP-NEXT: .LBB11_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -8709,8 +9033,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -8734,9 +9059,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX9-DPP-NEXT: .LBB11_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -8752,8 +9078,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[0:1] ; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000 @@ -8776,8 +9103,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-DPP-NEXT: 
v_mov_b32_e32 v2, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX1064-DPP-NEXT: .LBB11_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -8793,8 +9121,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s6, s0 ; GFX1032-DPP-NEXT: s_mov_b32 s7, 0x43300000 @@ -8816,8 +9145,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s2, exec_lo, s4 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX1032-DPP-NEXT: .LBB11_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -8827,15 +9157,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off ; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off ; 
GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -8859,9 +9190,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX1164-DPP-NEXT: .LBB11_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -8872,13 +9204,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off ; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB11_3 ; 
GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -8899,9 +9232,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s2, exec_lo, s4 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX1132-DPP-NEXT: .LBB11_3: ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fsub ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") monotonic @@ -8955,9 +9289,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execz .LBB12_5 +; GFX7LESS-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB12_5 ; GFX7LESS-NEXT: ; %bb.3: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 @@ -8978,10 +9313,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 ; GFX7LESS-NEXT: v_mov_b32_e32 
v3, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_4 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB12_4 ; GFX7LESS-NEXT: .LBB12_5: ; GFX7LESS-NEXT: s_endpgm ; @@ -9030,9 +9366,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB12_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB12_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 @@ -9048,9 +9385,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB12_4 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB12_4 ; GFX9-NEXT: .LBB12_5: ; GFX9-NEXT: s_endpgm ; @@ -9099,9 +9437,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB12_5 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB12_5 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: 
s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 @@ -9118,8 +9457,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB12_4 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB12_4 ; GFX1064-NEXT: .LBB12_5: ; GFX1064-NEXT: s_endpgm ; @@ -9168,9 +9508,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB12_5 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB12_5 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 @@ -9186,8 +9527,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB12_4 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB12_4 ; GFX1032-NEXT: .LBB12_5: ; GFX1032-NEXT: s_endpgm ; @@ -9227,12 +9569,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1164-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], 
exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB12_5 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB12_5 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 @@ -9249,9 +9592,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1164-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB12_4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB12_4 ; GFX1164-NEXT: .LBB12_5: ; GFX1164-NEXT: s_endpgm ; @@ -9291,11 +9635,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB12_5 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB12_5 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: 
v_mov_b32_e32 v6, 0 @@ -9310,9 +9655,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB12_4 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB12_4 ; GFX1132-NEXT: .LBB12_5: ; GFX1132-NEXT: s_endpgm ; @@ -9362,10 +9708,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB12_1 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -9449,11 +9796,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 ; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB12_3 ; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] ; 
GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 @@ -9468,9 +9816,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 ; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB12_2 ; GFX9-DPP-NEXT: .LBB12_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -9543,12 +9892,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB12_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -9565,8 +9915,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; 
GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB12_2 ; GFX1064-DPP-NEXT: .LBB12_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -9633,13 +9984,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB12_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -9655,8 +10007,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB12_2 ; GFX1032-DPP-NEXT: .LBB12_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -9726,16 +10079,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB12_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 @@ -9752,9 +10105,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB12_2 ; GFX1164-DPP-NEXT: .LBB12_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -9815,15 +10169,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: 
s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB12_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 @@ -9838,9 +10193,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB12_2 ; GFX1132-DPP-NEXT: .LBB12_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() strictfp @@ -9861,8 +10217,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: 
v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB13_3 +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[4:5] @@ -9891,10 +10248,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB13_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX7LESS-NEXT: .LBB13_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -9910,8 +10268,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB13_3 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -9935,9 +10294,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB13_2 +; 
GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX9-NEXT: .LBB13_3: ; GFX9-NEXT: s_endpgm ; @@ -9953,8 +10313,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1064-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB13_3 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[0:1] ; GFX1064-NEXT: s_mov_b32 s5, 0x43300000 @@ -9977,8 +10338,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1064-NEXT: .LBB13_3: ; GFX1064-NEXT: s_endpgm ; @@ -9994,8 +10356,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB13_3 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s6, s0 ; GFX1032-NEXT: s_mov_b32 s7, 0x43300000 @@ -10017,8 +10380,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 
-; GFX1032-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1032-NEXT: s_andn2_b32 s2, exec_lo, s4 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1032-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1032-NEXT: .LBB13_3: ; GFX1032-NEXT: s_endpgm ; @@ -10028,15 +10392,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-NEXT: v_mov_b32_e32 v1, s0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off ; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB13_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -10060,9 +10425,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1164-NEXT: .LBB13_3: ; GFX1164-NEXT: s_endpgm ; @@ -10073,13 +10439,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; 
GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off ; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB13_3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -10100,9 +10467,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX1132-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s2, exec_lo, s4 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1132-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1132-NEXT: .LBB13_3: ; GFX1132-NEXT: s_endpgm ; @@ -10118,8 +10486,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX7LESS-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-DPP-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[4:5] @@ 
-10148,10 +10517,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX7LESS-DPP-NEXT: .LBB13_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -10167,8 +10537,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -10192,9 +10563,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX9-DPP-NEXT: .LBB13_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -10210,8 +10582,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 ; 
GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[0:1] ; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000 @@ -10234,8 +10607,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1064-DPP-NEXT: .LBB13_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -10251,8 +10625,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s6, s0 ; GFX1032-DPP-NEXT: s_mov_b32 s7, 0x43300000 @@ -10274,8 +10649,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s2, exec_lo, s4 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1032-DPP-NEXT: s_cbranch_scc1 
.LBB13_2 ; GFX1032-DPP-NEXT: .LBB13_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -10285,15 +10661,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off ; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -10317,9 +10694,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1164-DPP-NEXT: .LBB13_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -10330,13 +10708,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, 
exec_lo, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off ; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -10357,9 +10736,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s2, exec_lo, s4 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s2, s4 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1132-DPP-NEXT: .LBB13_3: ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fsub ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic @@ -10414,9 +10794,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execz .LBB14_5 +; GFX7LESS-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS-NEXT: s_and_b64 
s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB14_5 ; GFX7LESS-NEXT: ; %bb.3: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 @@ -10437,10 +10818,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB14_4 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB14_4 ; GFX7LESS-NEXT: .LBB14_5: ; GFX7LESS-NEXT: s_endpgm ; @@ -10489,9 +10871,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB14_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB14_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 @@ -10507,9 +10890,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB14_4 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB14_4 ; GFX9-NEXT: .LBB14_5: ; GFX9-NEXT: s_endpgm ; @@ -10558,9 
+10942,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB14_5 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB14_5 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 @@ -10577,8 +10962,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB14_4 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_4 ; GFX1064-NEXT: .LBB14_5: ; GFX1064-NEXT: s_endpgm ; @@ -10627,9 +11013,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB14_5 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB14_5 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 @@ -10645,8 +11032,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; 
GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB14_4 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_4 ; GFX1032-NEXT: .LBB14_5: ; GFX1032-NEXT: s_endpgm ; @@ -10686,12 +11074,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB14_5 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB14_5 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 @@ -10708,9 +11097,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB14_4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_4 ; GFX1164-NEXT: .LBB14_5: ; GFX1164-NEXT: s_endpgm ; @@ -10750,11 +11140,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; 
GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB14_5 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB14_5 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 @@ -10769,9 +11160,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB14_4 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_4 ; GFX1132-NEXT: .LBB14_5: ; GFX1132-NEXT: s_endpgm ; @@ -10821,10 +11213,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB14_1 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ 
-10908,11 +11301,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 ; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB14_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB14_3 ; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 @@ -10927,9 +11321,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 ; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB14_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB14_2 ; GFX9-DPP-NEXT: .LBB14_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -11002,12 +11397,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB14_3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; 
GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB14_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -11024,8 +11420,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB14_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB14_2 ; GFX1064-DPP-NEXT: .LBB14_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -11092,13 +11489,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB14_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB14_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -11114,8 +11512,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; 
GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB14_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB14_2 ; GFX1032-DPP-NEXT: .LBB14_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -11185,16 +11584,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB14_3 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB14_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 @@ -11211,9 +11610,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, 
exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB14_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB14_2 ; GFX1164-DPP-NEXT: .LBB14_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -11274,15 +11674,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB14_3 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB14_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 @@ -11297,9 +11698,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB14_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB14_2 ; GFX1132-DPP-NEXT: .LBB14_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() @@ -11355,9 +11757,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execz .LBB15_5 +; GFX7LESS-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB15_5 ; GFX7LESS-NEXT: ; %bb.3: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 @@ -11378,10 +11781,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB15_4 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB15_4 ; GFX7LESS-NEXT: .LBB15_5: ; GFX7LESS-NEXT: s_endpgm ; @@ -11430,9 +11834,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB15_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB15_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 @@ -11448,9 +11853,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB15_4 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB15_4 ; GFX9-NEXT: .LBB15_5: ; GFX9-NEXT: s_endpgm ; @@ -11499,9 +11905,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB15_5 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB15_5 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 @@ -11518,8 +11925,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB15_4 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: 
s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_4 ; GFX1064-NEXT: .LBB15_5: ; GFX1064-NEXT: s_endpgm ; @@ -11568,9 +11976,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB15_5 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB15_5 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 @@ -11586,8 +11995,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB15_4 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_4 ; GFX1032-NEXT: .LBB15_5: ; GFX1032-NEXT: s_endpgm ; @@ -11627,12 +12037,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB15_5 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; 
GFX1164-NEXT: s_cbranch_scc0 .LBB15_5 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 @@ -11649,9 +12060,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB15_4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_4 ; GFX1164-NEXT: .LBB15_5: ; GFX1164-NEXT: s_endpgm ; @@ -11691,11 +12103,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB15_5 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB15_5 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 @@ -11710,9 +12123,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB15_4 +; 
GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_4 ; GFX1132-NEXT: .LBB15_5: ; GFX1132-NEXT: s_endpgm ; @@ -11762,10 +12176,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB15_1 +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -11849,11 +12264,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 ; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB15_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB15_3 ; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 @@ -11868,9 +12284,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 ; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX9-DPP-NEXT: 
v_mov_b32_e32 v11, v1 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB15_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB15_2 ; GFX9-DPP-NEXT: .LBB15_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -11943,12 +12360,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB15_3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB15_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -11965,8 +12383,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB15_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB15_2 ; GFX1064-DPP-NEXT: .LBB15_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -12033,13 +12452,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: 
v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB15_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB15_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -12055,8 +12475,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB15_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB15_2 ; GFX1032-DPP-NEXT: .LBB15_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -12126,16 +12547,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; 
GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB15_3 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB15_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 @@ -12152,9 +12573,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB15_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB15_2 ; GFX1164-DPP-NEXT: .LBB15_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -12215,15 +12637,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB15_3 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB15_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 @@ -12238,9 +12661,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB15_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB15_2 ; GFX1132-DPP-NEXT: .LBB15_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.float.value() strictfp @@ -12265,10 +12689,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0xc3300000 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v5, exec_hi, v5 ; GFX7LESS-NEXT: v_add_f64 v[3:4], s[0:1], v[3:4] -; GFX7LESS-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB16_3 +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] +; GFX7LESS-NEXT: 
s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s33, s8 ; GFX7LESS-NEXT: s_mov_b32 s40, s7 @@ -12320,13 +12745,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s43 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB16_2 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX7LESS-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX7LESS-NEXT: .LBB16_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -12349,8 +12775,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB16_3 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 20, v2 @@ -12404,8 +12831,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-NEXT: s_cbranch_execnz .LBB16_2 +; GFX9-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX9-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] 
+; GFX9-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX9-NEXT: .LBB16_3: ; GFX9-NEXT: s_endpgm ; @@ -12426,8 +12854,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB16_3 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 @@ -12482,8 +12911,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1064-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX1064-NEXT: .LBB16_3: ; GFX1064-NEXT: s_endpgm ; @@ -12504,8 +12934,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1032-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB16_3 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 @@ -12559,8 +12990,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: 
s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1032-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX1032-NEXT: .LBB16_3: ; GFX1032-NEXT: s_endpgm ; @@ -12576,7 +13008,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:20 ; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:16 ; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:16 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -12584,8 +13015,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB16_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-NEXT: s_mov_b32 s33, s8 @@ -12636,8 +13069,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_2 
; GFX1164-NEXT: .LBB16_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm @@ -12655,14 +13090,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:16 ; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:16 ; GFX1132-NEXT: s_mov_b32 s32, 32 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB16_3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1132-NEXT: s_mov_b32 s33, s15 @@ -12707,8 +13143,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1132-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX1132-NEXT: .LBB16_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm @@ -12730,10 +13168,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0xc3300000 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v5, exec_hi, v5 ; GFX7LESS-DPP-NEXT: v_add_f64 v[3:4], s[0:1], v[3:4] -; 
GFX7LESS-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX7LESS-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] +; GFX7LESS-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-DPP-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 ; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 @@ -12785,13 +13224,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s43 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX7LESS-DPP-NEXT: .LBB16_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -12814,8 +13254,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; 
GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 @@ -12869,8 +13310,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX9-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX9-DPP-NEXT: .LBB16_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -12891,8 +13333,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 @@ -12947,8 +13390,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX1064-DPP-NEXT: .LBB16_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -12969,8 +13413,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; 
GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 @@ -13024,8 +13469,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX1032-DPP-NEXT: .LBB16_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -13041,7 +13487,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:20 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:16 ; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:16 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -13049,8 +13494,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; 
GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 @@ -13101,8 +13548,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1164-DPP-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX1164-DPP-NEXT: .LBB16_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm @@ -13120,14 +13569,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:16 ; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:16 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 @@ -13172,8 +13622,10 @@ define amdgpu_kernel void 
@global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1132-DPP-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX1132-DPP-NEXT: .LBB16_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm @@ -13235,9 +13687,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execz .LBB17_5 +; GFX7LESS-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB17_5 ; GFX7LESS-NEXT: ; %bb.3: ; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 @@ -13279,13 +13732,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] -; 
GFX7LESS-NEXT: s_cbranch_execnz .LBB17_4 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], exec, s[42:43] +; GFX7LESS-NEXT: s_cselect_b64 exec, s[0:1], s[42:43] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB17_4 ; GFX7LESS-NEXT: .LBB17_5: ; GFX7LESS-NEXT: s_endpgm ; @@ -13340,9 +13794,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB17_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB17_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -13386,8 +13841,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-NEXT: s_cbranch_execnz .LBB17_4 +; GFX9-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX9-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX9-NEXT: s_cbranch_scc1 .LBB17_4 ; GFX9-NEXT: .LBB17_5: ; GFX9-NEXT: s_endpgm ; @@ -13442,9 +13898,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB17_5 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB17_5 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[42:43], 
s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -13489,8 +13946,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB17_4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_4 ; GFX1064-NEXT: .LBB17_5: ; GFX1064-NEXT: s_endpgm ; @@ -13545,9 +14003,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s44, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB17_5 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB17_5 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 @@ -13591,8 +14050,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB17_4 +; GFX1032-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_4 ; GFX1032-NEXT: .LBB17_5: ; GFX1032-NEXT: s_endpgm ; @@ -13638,12 +14098,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; 
GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB17_5 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB17_5 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 @@ -13686,8 +14147,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-NEXT: s_cbranch_execnz .LBB17_4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_4 ; GFX1164-NEXT: .LBB17_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm @@ -13734,11 +14197,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s44, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB17_5 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB17_5 ; GFX1132-NEXT: ; %bb.3: 
; GFX1132-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 @@ -13776,8 +14240,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-NEXT: s_cbranch_execnz .LBB17_4 +; GFX1132-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_4 ; GFX1132-NEXT: .LBB17_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm @@ -13858,13 +14324,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-DPP-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[42:43] -; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7LESS-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[42:43] +; GFX7LESS-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[42:43] +; GFX7LESS-DPP-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; @@ -13955,8 +14422,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: v_readlane_b32 s42, v8, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 
s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB17_3 +; GFX9-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB17_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -14000,8 +14468,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_2 +; GFX9-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[46:47] +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[46:47] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB17_2 ; GFX9-DPP-NEXT: .LBB17_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -14084,8 +14553,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB17_3 +; GFX1064-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB17_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -14130,8 +14600,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB17_2 ; GFX1064-DPP-NEXT: .LBB17_3: ; GFX1064-DPP-NEXT: 
s_endpgm ; @@ -14209,8 +14680,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB17_3 +; GFX1032-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB17_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -14254,8 +14726,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB17_2 ; GFX1032-DPP-NEXT: .LBB17_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -14334,13 +14807,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB17_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; 
GFX1164-DPP-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB17_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -14383,8 +14857,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_2 +; GFX1164-DPP-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB17_2 ; GFX1164-DPP-NEXT: .LBB17_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm @@ -14452,15 +14928,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB17_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; 
GFX1132-DPP-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB17_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -14498,8 +14975,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_2 +; GFX1132-DPP-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB17_2 ; GFX1132-DPP-NEXT: .LBB17_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll b/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll index f34f9f38feeb4a..08732f5aa9f4fe 100644 --- a/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll @@ -18,8 +18,9 @@ define amdgpu_ps void @i1_copy_from_loop(ptr addrspace(8) inreg %rsrc, i32 %tid) ; SI-NEXT: s_andn2_b64 s[6:7], s[6:7], exec ; SI-NEXT: s_and_b64 s[10:11], s[10:11], exec ; SI-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] -; SI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB0_7 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[4:5] +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[4:5] +; SI-NEXT: s_cbranch_scc0 .LBB0_6 ; SI-NEXT: .LBB0_3: ; %for.body ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_cmp_lt_u32 s14, 4 @@ -29,28 +30,29 @@ define amdgpu_ps void @i1_copy_from_loop(ptr addrspace(8) inreg %rsrc, i32 %tid) ; SI-NEXT: s_cbranch_scc1 .LBB0_1 ; SI-NEXT: ; %bb.4: ; %mid.loop ; SI-NEXT: ; in Loop: Header=BB0_3 Depth=1 
+; SI-NEXT: s_mov_b64 s[12:13], exec ; SI-NEXT: v_mov_b32_e32 v1, s14 ; SI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 idxen offen -; SI-NEXT: s_mov_b64 s[10:11], -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_le_f32_e32 vcc, 0, v1 +; SI-NEXT: s_mov_b64 s[10:11], -1 +; SI-NEXT: s_and_b64 s[8:9], vcc, -1 ; SI-NEXT: s_mov_b64 s[8:9], -1 -; SI-NEXT: s_and_saveexec_b64 s[12:13], vcc +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB0_2 ; SI-NEXT: ; %bb.5: ; %end.loop ; SI-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; SI-NEXT: s_add_i32 s14, s14, 1 ; SI-NEXT: s_xor_b64 s[8:9], exec, -1 -; SI-NEXT: ; %bb.6: ; %Flow1 -; SI-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; SI-NEXT: s_or_b64 exec, exec, s[12:13] ; SI-NEXT: s_branch .LBB0_2 -; SI-NEXT: .LBB0_7: ; %for.end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_and_saveexec_b64 s[0:1], s[6:7] -; SI-NEXT: s_cbranch_execz .LBB0_9 -; SI-NEXT: ; %bb.8: ; %if +; SI-NEXT: .LBB0_6: ; %for.end +; SI-NEXT: s_and_b64 s[0:1], s[6:7], exec +; SI-NEXT: s_cmov_b64 exec, s[0:1] +; SI-NEXT: s_cbranch_scc0 .LBB0_8 +; SI-NEXT: ; %bb.7: ; %if ; SI-NEXT: exp mrt0 v0, v0, v0, v0 done vm -; SI-NEXT: .LBB0_9: ; %end +; SI-NEXT: .LBB0_8: ; %end ; SI-NEXT: s_endpgm entry: br label %for.body diff --git a/llvm/test/CodeGen/AMDGPU/i1_copy_phi_with_phi_incoming_value.mir b/llvm/test/CodeGen/AMDGPU/i1_copy_phi_with_phi_incoming_value.mir index ac0931b6022f1e..4980fb5ab39eed 100644 --- a/llvm/test/CodeGen/AMDGPU/i1_copy_phi_with_phi_incoming_value.mir +++ b/llvm/test/CodeGen/AMDGPU/i1_copy_phi_with_phi_incoming_value.mir @@ -33,10 +33,9 @@ body: | ; GCN-NEXT: bb.2: ; GCN-NEXT: successors: %bb.5(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[PHI:%[0-9]+]]:sreg_64 = PHI %15, %bb.6 - ; GCN-NEXT: SI_END_CF [[PHI]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: [[S_MOV_B64_2:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 ; GCN-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def 
dead $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: S_BRANCH %bb.5 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.3: @@ -45,33 +44,32 @@ body: | ; GCN-NEXT: ATOMIC_FENCE 5, 2 ; GCN-NEXT: S_BARRIER ; GCN-NEXT: ATOMIC_FENCE 4, 2 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sreg_64 = COPY %18 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sreg_64 = COPY %16 ; GCN-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[COPY6]], %bb.7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: S_BRANCH %bb.4 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.4: ; GCN-NEXT: successors: %bb.7(0x80000000) ; GCN-NEXT: {{ $}} + ; GCN-NEXT: SI_WAVE_RECONVERGE [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: S_BRANCH %bb.7 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.5: ; GCN-NEXT: successors: %bb.3(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[PHI1:%[0-9]+]]:sreg_64 = PHI [[S_MOV_B64_]], %bb.0, [[COPY5]], %bb.2 - ; GCN-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: [[PHI:%[0-9]+]]:sreg_64 = PHI [[S_MOV_B64_]], %bb.0, [[COPY5]], %bb.2 ; GCN-NEXT: S_BRANCH %bb.3 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.6: ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.6(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[PHI2:%[0-9]+]]:sreg_64 = PHI [[S_MOV_B64_1]], %bb.1, %15, %bb.6 + ; GCN-NEXT: [[PHI1:%[0-9]+]]:sreg_64 = PHI [[S_MOV_B64_1]], %bb.1, %21, %bb.6 ; GCN-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[COPY4]] - ; GCN-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_64 = SI_IF_BREAK [[COPY7]], [[PHI2]], implicit-def dead $scc + ; GCN-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_64 = SI_IF_BREAK [[COPY7]], [[PHI1]], implicit-def dead $scc ; GCN-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: S_BRANCH %bb.2 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.7: - ; GCN-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1, %bb.5 @@ -89,7 +87,6 @@ body: 
| S_BRANCH %bb.1 bb.1: - ; predecessors: %bb.0 successors: %bb.6 %10:sreg_32 = S_MOV_B32 16 @@ -100,17 +97,14 @@ body: | S_BRANCH %bb.6 bb.2: - ; predecessors: %bb.6 successors: %bb.5 - %20:sreg_64 = PHI %6:sreg_64, %bb.6 - SI_END_CF %20:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec %15:sreg_64 = S_MOV_B64 -1 %21:vreg_1 = COPY %15:sreg_64, implicit $exec + SI_WAVE_RECONVERGE %16:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.5 bb.3: - ; predecessors: %bb.5 successors: %bb.4, %bb.7 %22:vreg_1 = PHI %7:vreg_1, %bb.5 @@ -122,21 +116,18 @@ body: | S_BRANCH %bb.4 bb.4: - ; predecessors: %bb.3 successors: %bb.7 + SI_WAVE_RECONVERGE %24:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.7 bb.5: - ; predecessors: %bb.0, %bb.2 successors: %bb.3 %7:vreg_1 = PHI %17:vreg_1, %bb.0, %21:vreg_1, %bb.2 - SI_END_CF %16:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.3 bb.6: - ; predecessors: %bb.1, %bb.6 successors: %bb.2, %bb.6 %5:sreg_64 = PHI %12:sreg_64, %bb.1, %6:sreg_64, %bb.6 @@ -146,9 +137,7 @@ body: | S_BRANCH %bb.2 bb.7: - ; predecessors: %bb.3, %bb.4 - SI_END_CF %24:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll index 5abd4c9069c919..ac93198c5cb80f 100644 --- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll +++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll @@ -6,374 +6,317 @@ define void @main(i1 %arg) #0 { ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-NEXT: v_writelane_b32 v8, s30, 0 -; CHECK-NEXT: v_writelane_b32 v8, s31, 1 -; CHECK-NEXT: v_writelane_b32 v8, s36, 2 -; CHECK-NEXT: v_writelane_b32 v8, s37, 3 -; CHECK-NEXT: v_writelane_b32 v8, s38, 4 -; CHECK-NEXT: v_writelane_b32 v8, s39, 5 -; CHECK-NEXT: v_writelane_b32 v8, s40, 6 -; CHECK-NEXT: v_writelane_b32 v8, s41, 7 -; CHECK-NEXT: v_writelane_b32 v8, s42, 8 -; CHECK-NEXT: v_writelane_b32 v8, s43, 9 -; CHECK-NEXT: v_writelane_b32 v8, s44, 10 -; CHECK-NEXT: v_writelane_b32 v8, s45, 11 -; CHECK-NEXT: v_writelane_b32 v8, s46, 12 -; CHECK-NEXT: v_writelane_b32 v8, s47, 13 -; CHECK-NEXT: v_writelane_b32 v8, s48, 14 -; CHECK-NEXT: v_writelane_b32 v8, s49, 15 +; CHECK-NEXT: v_writelane_b32 v7, s30, 0 +; CHECK-NEXT: v_writelane_b32 v7, s31, 1 +; CHECK-NEXT: v_writelane_b32 v7, s36, 2 +; CHECK-NEXT: v_writelane_b32 v7, s37, 3 +; CHECK-NEXT: v_writelane_b32 v7, s38, 4 +; CHECK-NEXT: v_writelane_b32 v7, s39, 5 +; CHECK-NEXT: v_writelane_b32 v7, s40, 6 +; CHECK-NEXT: v_writelane_b32 v7, s41, 7 +; CHECK-NEXT: v_writelane_b32 v7, s42, 8 +; 
CHECK-NEXT: v_writelane_b32 v7, s43, 9 +; CHECK-NEXT: v_writelane_b32 v7, s44, 10 +; CHECK-NEXT: v_writelane_b32 v7, s45, 11 +; CHECK-NEXT: v_writelane_b32 v7, s46, 12 +; CHECK-NEXT: v_writelane_b32 v7, s47, 13 +; CHECK-NEXT: v_writelane_b32 v7, s48, 14 +; CHECK-NEXT: v_writelane_b32 v7, s49, 15 ; CHECK-NEXT: s_getpc_b64 s[24:25] -; CHECK-NEXT: v_writelane_b32 v8, s50, 16 +; CHECK-NEXT: v_writelane_b32 v7, s50, 16 ; CHECK-NEXT: s_movk_i32 s4, 0xf0 ; CHECK-NEXT: s_mov_b32 s5, s24 -; CHECK-NEXT: v_writelane_b32 v8, s51, 17 +; CHECK-NEXT: v_writelane_b32 v7, s51, 17 ; CHECK-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 -; CHECK-NEXT: ; implicit-def: $vgpr4 : SGPR spill to VGPR lane ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_load_dwordx4 s[28:31], s[4:5], 0x0 ; CHECK-NEXT: s_movk_i32 s20, 0x130 ; CHECK-NEXT: s_mov_b32 s21, s24 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_writelane_b32 v4, s36, 0 -; CHECK-NEXT: v_writelane_b32 v4, s37, 1 -; CHECK-NEXT: v_writelane_b32 v4, s38, 2 -; CHECK-NEXT: v_writelane_b32 v4, s39, 3 -; CHECK-NEXT: v_writelane_b32 v4, s40, 4 -; CHECK-NEXT: v_writelane_b32 v4, s41, 5 -; CHECK-NEXT: v_writelane_b32 v4, s42, 6 -; CHECK-NEXT: v_writelane_b32 v4, s43, 7 -; CHECK-NEXT: v_writelane_b32 v4, s44, 8 -; CHECK-NEXT: v_writelane_b32 v4, s45, 9 -; CHECK-NEXT: v_writelane_b32 v4, s46, 10 ; CHECK-NEXT: s_load_dwordx16 s[4:19], s[20:21], 0x0 -; CHECK-NEXT: v_writelane_b32 v4, s47, 11 -; CHECK-NEXT: v_writelane_b32 v4, s48, 12 -; CHECK-NEXT: v_writelane_b32 v4, s49, 13 ; CHECK-NEXT: s_mov_b32 s20, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: v_writelane_b32 v4, s50, 14 -; CHECK-NEXT: v_mov_b32_e32 v5, s28 -; CHECK-NEXT: v_mov_b32_e32 v6, v1 +; CHECK-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v4, s28 +; CHECK-NEXT: v_mov_b32_e32 v5, v1 ; CHECK-NEXT: s_mov_b32 s21, s20 ; CHECK-NEXT: s_mov_b32 s22, s20 ; CHECK-NEXT: s_mov_b32 s23, s20 -; CHECK-NEXT: 
v_writelane_b32 v4, s51, 15 ; CHECK-NEXT: v_mov_b32_e32 v2, v1 -; CHECK-NEXT: image_sample_lz v5, v[5:6], s[44:51], s[20:23] dmask:0x1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_writelane_b32 v4, s4, 16 -; CHECK-NEXT: v_writelane_b32 v4, s5, 17 -; CHECK-NEXT: v_writelane_b32 v4, s6, 18 -; CHECK-NEXT: v_writelane_b32 v4, s7, 19 -; CHECK-NEXT: v_writelane_b32 v4, s8, 20 -; CHECK-NEXT: v_writelane_b32 v4, s9, 21 -; CHECK-NEXT: image_sample_lz v6, v[1:2], s[4:11], s[20:23] dmask:0x1 -; CHECK-NEXT: v_writelane_b32 v4, s10, 22 -; CHECK-NEXT: v_writelane_b32 v4, s11, 23 -; CHECK-NEXT: v_writelane_b32 v4, s12, 24 -; CHECK-NEXT: v_writelane_b32 v4, s13, 25 -; CHECK-NEXT: v_writelane_b32 v4, s14, 26 -; CHECK-NEXT: v_writelane_b32 v4, s15, 27 -; CHECK-NEXT: v_writelane_b32 v8, s52, 18 -; CHECK-NEXT: v_writelane_b32 v4, s16, 28 -; CHECK-NEXT: v_writelane_b32 v8, s53, 19 -; CHECK-NEXT: v_writelane_b32 v4, s17, 29 -; CHECK-NEXT: v_writelane_b32 v8, s54, 20 -; CHECK-NEXT: v_writelane_b32 v4, s18, 30 +; CHECK-NEXT: v_writelane_b32 v3, s36, 0 +; CHECK-NEXT: v_writelane_b32 v7, s52, 18 +; CHECK-NEXT: v_writelane_b32 v7, s53, 19 +; CHECK-NEXT: v_writelane_b32 v3, s37, 1 +; CHECK-NEXT: v_writelane_b32 v7, s54, 20 +; CHECK-NEXT: v_writelane_b32 v3, s38, 2 +; CHECK-NEXT: image_sample_lz v4, v[4:5], s[44:51], s[20:23] dmask:0x1 +; CHECK-NEXT: v_writelane_b32 v7, s55, 21 +; CHECK-NEXT: image_sample_lz v5, v[1:2], s[4:11], s[20:23] dmask:0x1 +; CHECK-NEXT: v_writelane_b32 v3, s39, 3 +; CHECK-NEXT: v_writelane_b32 v7, s56, 22 +; CHECK-NEXT: v_writelane_b32 v3, s40, 4 +; CHECK-NEXT: v_writelane_b32 v7, s57, 23 +; CHECK-NEXT: v_writelane_b32 v3, s41, 5 +; CHECK-NEXT: v_writelane_b32 v7, s58, 24 +; CHECK-NEXT: v_writelane_b32 v3, s42, 6 +; CHECK-NEXT: v_writelane_b32 v7, s59, 25 +; CHECK-NEXT: v_writelane_b32 v3, s43, 7 +; CHECK-NEXT: v_writelane_b32 v7, s60, 26 +; CHECK-NEXT: v_writelane_b32 v3, s44, 8 +; CHECK-NEXT: v_writelane_b32 v7, s61, 27 +; CHECK-NEXT: v_writelane_b32 v3, 
s45, 9 +; CHECK-NEXT: v_writelane_b32 v7, s62, 28 +; CHECK-NEXT: v_writelane_b32 v3, s46, 10 +; CHECK-NEXT: v_writelane_b32 v7, s63, 29 +; CHECK-NEXT: v_writelane_b32 v3, s47, 11 +; CHECK-NEXT: v_writelane_b32 v7, s64, 30 +; CHECK-NEXT: v_writelane_b32 v3, s48, 12 +; CHECK-NEXT: v_writelane_b32 v7, s65, 31 +; CHECK-NEXT: v_writelane_b32 v3, s49, 13 +; CHECK-NEXT: v_writelane_b32 v7, s66, 32 +; CHECK-NEXT: v_writelane_b32 v3, s50, 14 ; CHECK-NEXT: s_mov_b32 s26, 48 -; CHECK-NEXT: s_mov_b32 s27, s24 -; CHECK-NEXT: v_writelane_b32 v8, s55, 21 -; CHECK-NEXT: v_writelane_b32 v4, s19, 31 -; CHECK-NEXT: s_load_dwordx8 s[4:11], s[26:27], 0x0 -; CHECK-NEXT: v_writelane_b32 v8, s56, 22 -; CHECK-NEXT: v_writelane_b32 v8, s57, 23 -; CHECK-NEXT: v_writelane_b32 v8, s58, 24 -; CHECK-NEXT: v_writelane_b32 v8, s59, 25 -; CHECK-NEXT: v_writelane_b32 v8, s60, 26 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_writelane_b32 v4, s4, 32 -; CHECK-NEXT: v_writelane_b32 v8, s61, 27 -; CHECK-NEXT: v_writelane_b32 v4, s5, 33 -; CHECK-NEXT: v_writelane_b32 v8, s62, 28 -; CHECK-NEXT: v_writelane_b32 v4, s6, 34 -; CHECK-NEXT: v_writelane_b32 v8, s63, 29 -; CHECK-NEXT: v_writelane_b32 v4, s7, 35 -; CHECK-NEXT: v_writelane_b32 v8, s64, 30 -; CHECK-NEXT: v_writelane_b32 v4, s8, 36 -; CHECK-NEXT: v_writelane_b32 v8, s65, 31 -; CHECK-NEXT: v_writelane_b32 v4, s9, 37 -; CHECK-NEXT: v_writelane_b32 v8, s66, 32 ; CHECK-NEXT: s_movk_i32 s28, 0x1f0 -; CHECK-NEXT: s_movk_i32 s30, 0x2f0 +; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 +; CHECK-NEXT: s_mov_b32 s27, s24 ; CHECK-NEXT: s_mov_b32 s29, s24 +; CHECK-NEXT: v_writelane_b32 v7, s67, 33 +; CHECK-NEXT: v_writelane_b32 v3, s51, 15 +; CHECK-NEXT: s_movk_i32 s30, 0x2f0 ; CHECK-NEXT: s_mov_b32 s31, s24 -; CHECK-NEXT: v_writelane_b32 v4, s10, 38 -; CHECK-NEXT: v_writelane_b32 v8, s67, 33 -; CHECK-NEXT: v_writelane_b32 v4, s11, 39 -; CHECK-NEXT: s_load_dwordx16 s[52:67], s[28:29], 0x0 -; CHECK-NEXT: s_load_dwordx16 s[4:19], s[30:31], 0x0 -; CHECK-NEXT: 
v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; CHECK-NEXT: s_load_dwordx8 s[4:11], s[26:27], 0x0 +; CHECK-NEXT: s_load_dwordx16 s[52:67], s[28:29], 0x0 +; CHECK-NEXT: s_load_dwordx16 s[36:51], s[30:31], 0x0 ; CHECK-NEXT: s_xor_b64 s[24:25], vcc, -1 -; CHECK-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane +; CHECK-NEXT: s_and_b64 vcc, s[24:25], exec +; CHECK-NEXT: s_xor_b64 s[26:27], vcc, exec +; CHECK-NEXT: s_cmp_lg_u64 vcc, 0 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mul_f32_e32 v0, v6, v5 -; CHECK-NEXT: s_and_saveexec_b64 s[26:27], s[24:25] -; CHECK-NEXT: s_xor_b64 s[26:27], exec, s[26:27] -; CHECK-NEXT: s_cbranch_execz .LBB0_3 +; CHECK-NEXT: v_mul_f32_e32 v0, v5, v4 +; CHECK-NEXT: s_cmov_b64 exec, vcc +; CHECK-NEXT: s_cbranch_scc0 .LBB0_4 ; CHECK-NEXT: ; %bb.1: ; %bb48 -; CHECK-NEXT: v_readlane_b32 s36, v4, 0 -; CHECK-NEXT: v_readlane_b32 s44, v4, 8 -; CHECK-NEXT: v_readlane_b32 s45, v4, 9 -; CHECK-NEXT: v_readlane_b32 s46, v4, 10 -; CHECK-NEXT: v_readlane_b32 s47, v4, 11 -; CHECK-NEXT: v_readlane_b32 s48, v4, 12 -; CHECK-NEXT: v_readlane_b32 s49, v4, 13 -; CHECK-NEXT: v_readlane_b32 s50, v4, 14 -; CHECK-NEXT: v_readlane_b32 s51, v4, 15 -; CHECK-NEXT: s_and_b64 vcc, exec, -1 -; CHECK-NEXT: v_readlane_b32 s37, v4, 1 -; CHECK-NEXT: v_readlane_b32 s38, v4, 2 -; CHECK-NEXT: v_readlane_b32 s39, v4, 3 -; CHECK-NEXT: v_readlane_b32 s40, v4, 4 -; CHECK-NEXT: image_sample_lz v5, v[1:2], s[44:51], s[20:23] dmask:0x1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_writelane_b32 v3, s36, 16 +; CHECK-NEXT: v_writelane_b32 v3, s37, 17 +; CHECK-NEXT: v_writelane_b32 v3, s38, 18 +; CHECK-NEXT: v_writelane_b32 v3, s39, 19 +; CHECK-NEXT: v_writelane_b32 v3, s40, 20 +; CHECK-NEXT: v_writelane_b32 v3, s41, 21 +; CHECK-NEXT: v_writelane_b32 v3, s42, 22 +; CHECK-NEXT: v_writelane_b32 v3, s43, 23 +; CHECK-NEXT: v_writelane_b32 v3, s44, 24 +; CHECK-NEXT: v_writelane_b32 v3, s45, 25 +; CHECK-NEXT: v_writelane_b32 v3, s46, 26 +; CHECK-NEXT: 
v_writelane_b32 v3, s47, 27 +; CHECK-NEXT: v_writelane_b32 v3, s48, 28 +; CHECK-NEXT: v_writelane_b32 v3, s49, 29 +; CHECK-NEXT: v_writelane_b32 v3, s50, 30 +; CHECK-NEXT: v_writelane_b32 v3, s51, 31 +; CHECK-NEXT: v_readlane_b32 s36, v3, 0 +; CHECK-NEXT: v_readlane_b32 s44, v3, 8 +; CHECK-NEXT: v_readlane_b32 s45, v3, 9 +; CHECK-NEXT: v_readlane_b32 s46, v3, 10 +; CHECK-NEXT: v_readlane_b32 s47, v3, 11 +; CHECK-NEXT: v_readlane_b32 s48, v3, 12 +; CHECK-NEXT: v_readlane_b32 s49, v3, 13 +; CHECK-NEXT: v_readlane_b32 s50, v3, 14 +; CHECK-NEXT: v_readlane_b32 s51, v3, 15 +; CHECK-NEXT: v_readlane_b32 s37, v3, 1 +; CHECK-NEXT: v_readlane_b32 s38, v3, 2 +; CHECK-NEXT: v_readlane_b32 s39, v3, 3 +; CHECK-NEXT: v_readlane_b32 s40, v3, 4 +; CHECK-NEXT: v_readlane_b32 s41, v3, 5 +; CHECK-NEXT: image_sample_lz v4, v[1:2], s[44:51], s[20:23] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s42, v3, 6 +; CHECK-NEXT: v_readlane_b32 s43, v3, 7 +; CHECK-NEXT: v_readlane_b32 s36, v3, 16 +; CHECK-NEXT: v_readlane_b32 s37, v3, 17 +; CHECK-NEXT: v_readlane_b32 s38, v3, 18 +; CHECK-NEXT: v_readlane_b32 s39, v3, 19 +; CHECK-NEXT: v_readlane_b32 s40, v3, 20 +; CHECK-NEXT: v_readlane_b32 s41, v3, 21 +; CHECK-NEXT: v_readlane_b32 s42, v3, 22 +; CHECK-NEXT: v_readlane_b32 s43, v3, 23 +; CHECK-NEXT: v_readlane_b32 s44, v3, 24 +; CHECK-NEXT: v_readlane_b32 s45, v3, 25 +; CHECK-NEXT: v_readlane_b32 s46, v3, 26 +; CHECK-NEXT: v_readlane_b32 s47, v3, 27 +; CHECK-NEXT: v_readlane_b32 s48, v3, 28 +; CHECK-NEXT: v_readlane_b32 s49, v3, 29 +; CHECK-NEXT: v_readlane_b32 s50, v3, 30 +; CHECK-NEXT: v_readlane_b32 s51, v3, 31 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: v_readlane_b32 s41, v4, 5 -; CHECK-NEXT: v_readlane_b32 s42, v4, 6 -; CHECK-NEXT: v_readlane_b32 s43, v4, 7 +; CHECK-NEXT: s_and_b64 vcc, exec, -1 ; CHECK-NEXT: .LBB0_2: ; %bb50 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_readlane_b32 s36, v4, 32 -; CHECK-NEXT: v_readlane_b32 s40, v4, 36 -; CHECK-NEXT: v_readlane_b32 
s41, v4, 37 -; CHECK-NEXT: v_readlane_b32 s42, v4, 38 -; CHECK-NEXT: v_readlane_b32 s43, v4, 39 ; CHECK-NEXT: s_mov_b32 s21, s20 ; CHECK-NEXT: s_mov_b32 s22, s20 ; CHECK-NEXT: s_mov_b32 s23, s20 -; CHECK-NEXT: v_readlane_b32 s37, v4, 33 -; CHECK-NEXT: v_readlane_b32 s38, v4, 34 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: image_sample_lz v6, v[1:2], s[60:67], s[40:43] dmask:0x1 -; CHECK-NEXT: v_readlane_b32 s39, v4, 35 -; CHECK-NEXT: image_sample_lz v1, v[1:2], s[12:19], s[20:23] dmask:0x1 +; CHECK-NEXT: image_sample_lz v5, v[1:2], s[60:67], s[8:11] dmask:0x1 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: image_sample_lz v1, v[1:2], s[44:51], s[20:23] dmask:0x1 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_sub_f32_e32 v1, v1, v6 +; CHECK-NEXT: v_sub_f32_e32 v1, v1, v5 ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v0 -; CHECK-NEXT: v_mul_f32_e32 v1, v1, v5 +; CHECK-NEXT: v_mul_f32_e32 v1, v1, v4 ; CHECK-NEXT: s_mov_b64 vcc, vcc ; CHECK-NEXT: s_cbranch_vccnz .LBB0_2 -; CHECK-NEXT: .LBB0_3: ; %Flow14 +; CHECK-NEXT: ; %bb.3: ; %Flow +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: s_or_b64 exec, exec, s[26:27] +; CHECK-NEXT: .LBB0_4: ; %Flow14 +; CHECK-NEXT: s_xor_b64 s[20:21], s[26:27], exec +; CHECK-NEXT: s_cmp_lg_u64 s[26:27], 0 +; CHECK-NEXT: s_cmov_b64 exec, s[26:27] +; CHECK-NEXT: s_cbranch_scc0 .LBB0_12 +; CHECK-NEXT: ; %bb.5: ; %bb32 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_readlane_b32 s12, v4, 32 -; CHECK-NEXT: v_readlane_b32 s13, v4, 33 -; CHECK-NEXT: v_readlane_b32 s14, v4, 34 -; CHECK-NEXT: v_readlane_b32 s15, v4, 35 -; CHECK-NEXT: v_readlane_b32 s16, v4, 36 -; CHECK-NEXT: v_readlane_b32 s17, v4, 37 -; CHECK-NEXT: v_readlane_b32 s18, v4, 38 -; CHECK-NEXT: v_readlane_b32 s19, v4, 39 -; CHECK-NEXT: v_writelane_b32 v4, s4, 40 -; CHECK-NEXT: v_writelane_b32 v4, s5, 41 -; CHECK-NEXT: v_writelane_b32 v4, s6, 42 -; CHECK-NEXT: v_writelane_b32 v4, s7, 43 -; CHECK-NEXT: v_writelane_b32 v4, s8, 44 -; CHECK-NEXT: v_writelane_b32 v4, s9, 45 -; CHECK-NEXT: 
v_writelane_b32 v4, s10, 46 -; CHECK-NEXT: v_writelane_b32 v4, s11, 47 -; CHECK-NEXT: v_writelane_b32 v4, s12, 48 -; CHECK-NEXT: v_writelane_b32 v4, s13, 49 -; CHECK-NEXT: v_writelane_b32 v4, s14, 50 -; CHECK-NEXT: v_writelane_b32 v4, s15, 51 -; CHECK-NEXT: v_writelane_b32 v4, s16, 52 -; CHECK-NEXT: v_writelane_b32 v4, s17, 53 -; CHECK-NEXT: v_writelane_b32 v4, s18, 54 -; CHECK-NEXT: v_writelane_b32 v4, s19, 55 -; CHECK-NEXT: v_writelane_b32 v4, s52, 56 -; CHECK-NEXT: v_writelane_b32 v3, s60, 0 -; CHECK-NEXT: v_writelane_b32 v4, s53, 57 -; CHECK-NEXT: v_writelane_b32 v3, s61, 1 -; CHECK-NEXT: v_writelane_b32 v4, s54, 58 -; CHECK-NEXT: v_writelane_b32 v3, s62, 2 -; CHECK-NEXT: v_writelane_b32 v4, s55, 59 -; CHECK-NEXT: v_writelane_b32 v3, s63, 3 -; CHECK-NEXT: v_writelane_b32 v4, s56, 60 -; CHECK-NEXT: v_writelane_b32 v3, s64, 4 -; CHECK-NEXT: v_writelane_b32 v4, s57, 61 -; CHECK-NEXT: v_writelane_b32 v3, s65, 5 -; CHECK-NEXT: v_writelane_b32 v4, s58, 62 -; CHECK-NEXT: v_writelane_b32 v3, s66, 6 -; CHECK-NEXT: v_writelane_b32 v4, s59, 63 -; CHECK-NEXT: v_writelane_b32 v3, s67, 7 -; CHECK-NEXT: s_andn2_saveexec_b64 s[20:21], s[26:27] -; CHECK-NEXT: s_cbranch_execz .LBB0_10 -; CHECK-NEXT: ; %bb.4: ; %bb32 -; CHECK-NEXT: s_and_saveexec_b64 s[8:9], s[24:25] -; CHECK-NEXT: s_xor_b64 s[22:23], exec, s[8:9] -; CHECK-NEXT: s_cbranch_execz .LBB0_6 -; CHECK-NEXT: ; %bb.5: ; %bb43 +; CHECK-NEXT: s_and_b64 s[8:9], s[24:25], exec +; CHECK-NEXT: s_xor_b64 s[22:23], s[8:9], exec +; CHECK-NEXT: s_cmp_lg_u64 s[8:9], 0 +; CHECK-NEXT: s_cmov_b64 exec, s[8:9] +; CHECK-NEXT: s_cbranch_scc0 .LBB0_7 +; CHECK-NEXT: ; %bb.6: ; %bb43 +; CHECK-NEXT: v_writelane_b32 v3, s36, 16 +; CHECK-NEXT: v_writelane_b32 v3, s37, 17 +; CHECK-NEXT: v_writelane_b32 v3, s38, 18 +; CHECK-NEXT: v_writelane_b32 v3, s39, 19 +; CHECK-NEXT: v_writelane_b32 v3, s40, 20 +; CHECK-NEXT: v_writelane_b32 v3, s41, 21 +; CHECK-NEXT: v_writelane_b32 v3, s42, 22 +; CHECK-NEXT: v_writelane_b32 v3, s43, 23 +; CHECK-NEXT: 
v_writelane_b32 v3, s44, 24 +; CHECK-NEXT: v_writelane_b32 v3, s45, 25 +; CHECK-NEXT: v_writelane_b32 v3, s46, 26 +; CHECK-NEXT: v_writelane_b32 v3, s47, 27 +; CHECK-NEXT: v_writelane_b32 v3, s48, 28 +; CHECK-NEXT: v_writelane_b32 v3, s49, 29 +; CHECK-NEXT: v_writelane_b32 v3, s50, 30 ; CHECK-NEXT: s_mov_b32 s8, 0 +; CHECK-NEXT: v_writelane_b32 v3, s51, 31 ; CHECK-NEXT: s_mov_b32 s9, s8 ; CHECK-NEXT: v_mov_b32_e32 v0, s8 -; CHECK-NEXT: v_readlane_b32 s36, v4, 0 +; CHECK-NEXT: v_readlane_b32 s36, v3, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, s9 ; CHECK-NEXT: s_mov_b32 s10, s8 ; CHECK-NEXT: s_mov_b32 s11, s8 -; CHECK-NEXT: v_readlane_b32 s37, v4, 1 -; CHECK-NEXT: v_readlane_b32 s38, v4, 2 -; CHECK-NEXT: v_readlane_b32 s39, v4, 3 -; CHECK-NEXT: v_readlane_b32 s40, v4, 4 -; CHECK-NEXT: v_readlane_b32 s41, v4, 5 -; CHECK-NEXT: v_readlane_b32 s42, v4, 6 -; CHECK-NEXT: v_readlane_b32 s43, v4, 7 -; CHECK-NEXT: v_readlane_b32 s44, v4, 8 -; CHECK-NEXT: v_readlane_b32 s45, v4, 9 -; CHECK-NEXT: v_readlane_b32 s46, v4, 10 -; CHECK-NEXT: v_readlane_b32 s47, v4, 11 -; CHECK-NEXT: v_readlane_b32 s48, v4, 12 -; CHECK-NEXT: v_readlane_b32 s49, v4, 13 -; CHECK-NEXT: v_readlane_b32 s50, v4, 14 -; CHECK-NEXT: v_readlane_b32 s51, v4, 15 -; CHECK-NEXT: image_sample_lz v5, v[0:1], s[36:43], s[8:11] dmask:0x1 -; CHECK-NEXT: v_readlane_b32 s36, v4, 16 -; CHECK-NEXT: v_readlane_b32 s44, v4, 24 -; CHECK-NEXT: v_readlane_b32 s45, v4, 25 -; CHECK-NEXT: v_readlane_b32 s46, v4, 26 -; CHECK-NEXT: v_readlane_b32 s47, v4, 27 -; CHECK-NEXT: v_readlane_b32 s48, v4, 28 -; CHECK-NEXT: v_readlane_b32 s49, v4, 29 -; CHECK-NEXT: v_readlane_b32 s50, v4, 30 -; CHECK-NEXT: v_readlane_b32 s51, v4, 31 -; CHECK-NEXT: v_mov_b32_e32 v6, 0 -; CHECK-NEXT: v_mov_b32_e32 v7, v6 -; CHECK-NEXT: v_readlane_b32 s37, v4, 17 -; CHECK-NEXT: v_readlane_b32 s38, v4, 18 -; CHECK-NEXT: v_readlane_b32 s39, v4, 19 -; CHECK-NEXT: image_sample_lz v0, v[0:1], s[44:51], s[12:15] dmask:0x1 -; CHECK-NEXT: v_readlane_b32 s40, v4, 20 -; 
CHECK-NEXT: v_readlane_b32 s41, v4, 21 -; CHECK-NEXT: v_readlane_b32 s42, v4, 22 -; CHECK-NEXT: v_readlane_b32 s43, v4, 23 +; CHECK-NEXT: v_readlane_b32 s37, v3, 1 +; CHECK-NEXT: v_readlane_b32 s38, v3, 2 +; CHECK-NEXT: v_readlane_b32 s39, v3, 3 +; CHECK-NEXT: v_readlane_b32 s40, v3, 4 +; CHECK-NEXT: v_readlane_b32 s41, v3, 5 +; CHECK-NEXT: v_readlane_b32 s42, v3, 6 +; CHECK-NEXT: v_readlane_b32 s43, v3, 7 +; CHECK-NEXT: v_readlane_b32 s44, v3, 8 +; CHECK-NEXT: v_readlane_b32 s45, v3, 9 +; CHECK-NEXT: v_readlane_b32 s46, v3, 10 +; CHECK-NEXT: v_readlane_b32 s47, v3, 11 +; CHECK-NEXT: v_readlane_b32 s48, v3, 12 +; CHECK-NEXT: image_sample_lz v4, v[0:1], s[36:43], s[8:11] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s49, v3, 13 +; CHECK-NEXT: image_sample_lz v0, v[0:1], s[12:19], s[4:7] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s50, v3, 14 +; CHECK-NEXT: v_readlane_b32 s51, v3, 15 +; CHECK-NEXT: v_readlane_b32 s36, v3, 16 +; CHECK-NEXT: v_mov_b32_e32 v5, 0 +; CHECK-NEXT: v_readlane_b32 s37, v3, 17 +; CHECK-NEXT: v_readlane_b32 s38, v3, 18 +; CHECK-NEXT: v_readlane_b32 s39, v3, 19 +; CHECK-NEXT: v_readlane_b32 s40, v3, 20 +; CHECK-NEXT: v_readlane_b32 s41, v3, 21 +; CHECK-NEXT: v_readlane_b32 s42, v3, 22 +; CHECK-NEXT: v_readlane_b32 s43, v3, 23 +; CHECK-NEXT: v_mov_b32_e32 v6, v5 +; CHECK-NEXT: v_readlane_b32 s44, v3, 24 +; CHECK-NEXT: v_readlane_b32 s45, v3, 25 +; CHECK-NEXT: v_readlane_b32 s46, v3, 26 +; CHECK-NEXT: v_readlane_b32 s47, v3, 27 +; CHECK-NEXT: v_readlane_b32 s48, v3, 28 +; CHECK-NEXT: v_readlane_b32 s49, v3, 29 +; CHECK-NEXT: v_readlane_b32 s50, v3, 30 +; CHECK-NEXT: v_readlane_b32 s51, v3, 31 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_dwordx3 v[5:7], off, s[8:11], 0 +; CHECK-NEXT: buffer_store_dwordx3 v[4:6], off, s[8:11], 0 ; CHECK-NEXT: s_waitcnt vmcnt(1) ; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; CHECK-NEXT: ; implicit-def: $vgpr0 -; CHECK-NEXT: .LBB0_6: ; %Flow12 -; CHECK-NEXT: s_or_saveexec_b64 s[4:5], s[22:23] -; 
CHECK-NEXT: v_readlane_b32 s52, v4, 40 -; CHECK-NEXT: v_readlane_b32 s53, v4, 41 -; CHECK-NEXT: v_readlane_b32 s54, v4, 42 -; CHECK-NEXT: v_readlane_b32 s55, v4, 43 -; CHECK-NEXT: v_readlane_b32 s56, v4, 44 -; CHECK-NEXT: v_readlane_b32 s57, v4, 45 -; CHECK-NEXT: v_readlane_b32 s58, v4, 46 -; CHECK-NEXT: v_readlane_b32 s59, v4, 47 -; CHECK-NEXT: v_readlane_b32 s60, v4, 48 -; CHECK-NEXT: v_readlane_b32 s61, v4, 49 -; CHECK-NEXT: v_readlane_b32 s62, v4, 50 -; CHECK-NEXT: v_readlane_b32 s63, v4, 51 -; CHECK-NEXT: v_readlane_b32 s64, v4, 52 -; CHECK-NEXT: v_readlane_b32 s65, v4, 53 -; CHECK-NEXT: v_readlane_b32 s66, v4, 54 -; CHECK-NEXT: v_readlane_b32 s67, v4, 55 -; CHECK-NEXT: s_xor_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_cbranch_execz .LBB0_9 -; CHECK-NEXT: ; %bb.7: ; %bb33.preheader +; CHECK-NEXT: s_or_b64 exec, exec, s[22:23] +; CHECK-NEXT: .LBB0_7: ; %Flow12 +; CHECK-NEXT: s_xor_b64 s[4:5], s[22:23], exec +; CHECK-NEXT: s_cmp_lg_u64 s[22:23], 0 +; CHECK-NEXT: s_cmov_b64 exec, s[22:23] +; CHECK-NEXT: s_cbranch_scc0 .LBB0_11 +; CHECK-NEXT: ; %bb.8: ; %bb33.preheader ; CHECK-NEXT: s_mov_b32 s8, 0 ; CHECK-NEXT: s_mov_b32 s6, s8 ; CHECK-NEXT: s_mov_b32 s7, s8 ; CHECK-NEXT: v_mov_b32_e32 v1, s6 -; CHECK-NEXT: v_readlane_b32 s36, v4, 56 ; CHECK-NEXT: s_mov_b32 s9, s8 ; CHECK-NEXT: s_mov_b32 s10, s8 ; CHECK-NEXT: s_mov_b32 s11, s8 ; CHECK-NEXT: v_mov_b32_e32 v2, s7 -; CHECK-NEXT: v_readlane_b32 s37, v4, 57 -; CHECK-NEXT: v_readlane_b32 s38, v4, 58 -; CHECK-NEXT: v_readlane_b32 s39, v4, 59 -; CHECK-NEXT: v_readlane_b32 s40, v4, 60 -; CHECK-NEXT: v_readlane_b32 s41, v4, 61 -; CHECK-NEXT: v_readlane_b32 s42, v4, 62 -; CHECK-NEXT: v_readlane_b32 s43, v4, 63 -; CHECK-NEXT: s_nop 4 +; CHECK-NEXT: image_sample_lz v4, v[1:2], s[52:59], s[8:11] dmask:0x1 ; CHECK-NEXT: image_sample_lz v5, v[1:2], s[36:43], s[8:11] dmask:0x1 -; CHECK-NEXT: image_sample_lz v6, v[1:2], s[52:59], s[8:11] dmask:0x1 -; CHECK-NEXT: ; kill: killed $vgpr1_vgpr2 -; CHECK-NEXT: s_mov_b64 s[12:13], s[36:37] ; 
CHECK-NEXT: s_and_b64 vcc, exec, 0 -; CHECK-NEXT: v_readlane_b32 s44, v3, 0 -; CHECK-NEXT: v_readlane_b32 s45, v3, 1 -; CHECK-NEXT: v_readlane_b32 s46, v3, 2 -; CHECK-NEXT: v_readlane_b32 s47, v3, 3 -; CHECK-NEXT: v_readlane_b32 s48, v3, 4 -; CHECK-NEXT: v_readlane_b32 s49, v3, 5 -; CHECK-NEXT: v_readlane_b32 s50, v3, 6 -; CHECK-NEXT: v_readlane_b32 s51, v3, 7 -; CHECK-NEXT: s_mov_b64 s[14:15], s[38:39] -; CHECK-NEXT: s_mov_b64 s[16:17], s[40:41] -; CHECK-NEXT: s_mov_b64 s[18:19], s[42:43] -; CHECK-NEXT: ; kill: killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 -; CHECK-NEXT: ; kill: killed $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59 -; CHECK-NEXT: ; kill: killed $sgpr8_sgpr9_sgpr10 killed $sgpr11 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_sub_f32_e32 v1, v6, v5 +; CHECK-NEXT: v_sub_f32_e32 v1, v5, v4 ; CHECK-NEXT: v_mul_f32_e32 v0, v1, v0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: .LBB0_8: ; %bb33 +; CHECK-NEXT: .LBB0_9: ; %bb33 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_f32_e32 v2, v1, v0 ; CHECK-NEXT: v_sub_f32_e32 v1, v1, v2 ; CHECK-NEXT: s_mov_b64 vcc, vcc -; CHECK-NEXT: s_cbranch_vccz .LBB0_8 -; CHECK-NEXT: .LBB0_9: ; %Flow13 +; CHECK-NEXT: s_cbranch_vccz .LBB0_9 +; CHECK-NEXT: ; %bb.10: ; %Flow11 ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: .LBB0_10: ; %UnifiedReturnBlock +; CHECK-NEXT: .LBB0_11: ; %Flow13 ; CHECK-NEXT: s_or_b64 exec, exec, s[20:21] -; CHECK-NEXT: v_readlane_b32 s67, v8, 33 -; CHECK-NEXT: v_readlane_b32 s66, v8, 32 -; CHECK-NEXT: v_readlane_b32 s65, v8, 31 -; CHECK-NEXT: v_readlane_b32 s64, v8, 30 -; CHECK-NEXT: v_readlane_b32 s63, v8, 29 -; CHECK-NEXT: v_readlane_b32 s62, v8, 28 -; CHECK-NEXT: v_readlane_b32 s61, v8, 27 -; CHECK-NEXT: v_readlane_b32 s60, v8, 26 -; CHECK-NEXT: v_readlane_b32 s59, v8, 25 -; CHECK-NEXT: v_readlane_b32 s58, v8, 24 -; CHECK-NEXT: v_readlane_b32 s57, v8, 23 -; CHECK-NEXT: v_readlane_b32 s56, v8, 22 -; CHECK-NEXT: v_readlane_b32 s55, 
v8, 21 -; CHECK-NEXT: v_readlane_b32 s54, v8, 20 -; CHECK-NEXT: v_readlane_b32 s53, v8, 19 -; CHECK-NEXT: v_readlane_b32 s52, v8, 18 -; CHECK-NEXT: v_readlane_b32 s51, v8, 17 -; CHECK-NEXT: v_readlane_b32 s50, v8, 16 -; CHECK-NEXT: v_readlane_b32 s49, v8, 15 -; CHECK-NEXT: v_readlane_b32 s48, v8, 14 -; CHECK-NEXT: v_readlane_b32 s47, v8, 13 -; CHECK-NEXT: v_readlane_b32 s46, v8, 12 -; CHECK-NEXT: v_readlane_b32 s45, v8, 11 -; CHECK-NEXT: v_readlane_b32 s44, v8, 10 -; CHECK-NEXT: v_readlane_b32 s43, v8, 9 -; CHECK-NEXT: v_readlane_b32 s42, v8, 8 -; CHECK-NEXT: v_readlane_b32 s41, v8, 7 -; CHECK-NEXT: v_readlane_b32 s40, v8, 6 -; CHECK-NEXT: v_readlane_b32 s39, v8, 5 -; CHECK-NEXT: v_readlane_b32 s38, v8, 4 -; CHECK-NEXT: v_readlane_b32 s37, v8, 3 -; CHECK-NEXT: v_readlane_b32 s36, v8, 2 -; CHECK-NEXT: v_readlane_b32 s31, v8, 1 -; CHECK-NEXT: v_readlane_b32 s30, v8, 0 -; CHECK-NEXT: ; kill: killed $vgpr4 +; CHECK-NEXT: .LBB0_12: ; %UnifiedReturnBlock +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_readlane_b32 s67, v7, 33 +; CHECK-NEXT: v_readlane_b32 s66, v7, 32 +; CHECK-NEXT: v_readlane_b32 s65, v7, 31 +; CHECK-NEXT: v_readlane_b32 s64, v7, 30 +; CHECK-NEXT: v_readlane_b32 s63, v7, 29 +; CHECK-NEXT: v_readlane_b32 s62, v7, 28 +; CHECK-NEXT: v_readlane_b32 s61, v7, 27 +; CHECK-NEXT: v_readlane_b32 s60, v7, 26 +; CHECK-NEXT: v_readlane_b32 s59, v7, 25 +; CHECK-NEXT: v_readlane_b32 s58, v7, 24 +; CHECK-NEXT: v_readlane_b32 s57, v7, 23 +; CHECK-NEXT: v_readlane_b32 s56, v7, 22 +; CHECK-NEXT: v_readlane_b32 s55, v7, 21 +; CHECK-NEXT: v_readlane_b32 s54, v7, 20 +; CHECK-NEXT: v_readlane_b32 s53, v7, 19 +; CHECK-NEXT: v_readlane_b32 s52, v7, 18 +; CHECK-NEXT: v_readlane_b32 s51, v7, 17 +; CHECK-NEXT: v_readlane_b32 s50, v7, 16 +; CHECK-NEXT: v_readlane_b32 s49, v7, 15 +; CHECK-NEXT: v_readlane_b32 s48, v7, 14 +; CHECK-NEXT: v_readlane_b32 s47, v7, 13 +; CHECK-NEXT: v_readlane_b32 s46, v7, 12 +; CHECK-NEXT: v_readlane_b32 s45, v7, 11 +; CHECK-NEXT: v_readlane_b32 s44, 
v7, 10 +; CHECK-NEXT: v_readlane_b32 s43, v7, 9 +; CHECK-NEXT: v_readlane_b32 s42, v7, 8 +; CHECK-NEXT: v_readlane_b32 s41, v7, 7 +; CHECK-NEXT: v_readlane_b32 s40, v7, 6 +; CHECK-NEXT: v_readlane_b32 s39, v7, 5 +; CHECK-NEXT: v_readlane_b32 s38, v7, 4 +; CHECK-NEXT: v_readlane_b32 s37, v7, 3 +; CHECK-NEXT: v_readlane_b32 s36, v7, 2 +; CHECK-NEXT: v_readlane_b32 s31, v7, 1 +; CHECK-NEXT: v_readlane_b32 s30, v7, 0 ; CHECK-NEXT: ; kill: killed $vgpr3 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll index a33142fd0ab1f3..18d85176d466ae 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -1610,9 +1610,6 @@ define amdgpu_kernel void @extract_neg_offset_vgpr(ptr addrspace(1) %out) { ; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[16:17] -; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 6 -; NOOPT-NEXT: v_readlane_b32 s1, v0, 7 ; NOOPT-NEXT: buffer_load_dword v18, off, s[20:23], 0 offset:72 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v1, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v2, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload @@ -1631,6 +1628,9 @@ define amdgpu_kernel void @extract_neg_offset_vgpr(ptr addrspace(1) 
%out) { ; NOOPT-NEXT: buffer_load_dword v15, off, s[20:23], 0 offset:60 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:68 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(14) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 6 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 7 ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_readfirstlane_b32 s2, v17 ; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v17 @@ -1660,12 +1660,12 @@ define amdgpu_kernel void @extract_neg_offset_vgpr(ptr addrspace(1) %out) { ; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 ; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[16:17] -; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_load_dword v1, off, s[20:23], 0 offset:76 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(1) ; NOOPT-NEXT: v_readlane_b32 s0, v0, 0 ; NOOPT-NEXT: v_readlane_b32 s1, v0, 1 ; NOOPT-NEXT: v_readlane_b32 s2, v0, 2 ; NOOPT-NEXT: v_readlane_b32 s3, v0, 3 -; NOOPT-NEXT: buffer_load_dword v1, off, s[20:23], 0 offset:76 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; NOOPT-NEXT: ; kill: killed $vgpr0 @@ -4137,9 +4137,6 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr ; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[16:17] -; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 6 -; NOOPT-NEXT: v_readlane_b32 s1, v0, 7 ; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v1, off, s[20:23], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v2, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload @@ -4159,6 +4156,9 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr ; NOOPT-NEXT: buffer_load_dword v16, 
off, s[20:23], 0 offset:60 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:68 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v18, off, s[20:23], 0 offset:136 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(14) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 6 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 7 ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_readfirstlane_b32 s2, v18 ; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v18 @@ -4221,11 +4221,6 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr ; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 ; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[16:17] -; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 0 -; NOOPT-NEXT: v_readlane_b32 s1, v0, 1 -; NOOPT-NEXT: v_readlane_b32 s2, v0, 2 -; NOOPT-NEXT: v_readlane_b32 s3, v0, 3 ; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:140 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:144 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v18, off, s[20:23], 0 offset:148 ; 4-byte Folded Reload @@ -4242,6 +4237,11 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr ; NOOPT-NEXT: buffer_load_dword v29, off, s[20:23], 0 offset:192 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v30, off, s[20:23], 0 offset:196 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v31, off, s[20:23], 0 offset:200 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(14) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 0 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 1 +; NOOPT-NEXT: v_readlane_b32 s2, v0, 2 +; NOOPT-NEXT: v_readlane_b32 s3, v0, 3 ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_mov_b32_e32 v5, v19 ; NOOPT-NEXT: v_mov_b32_e32 v6, v18 @@ -4624,9 +4624,6 @@ define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, p ; NOOPT-NEXT: s_waitcnt expcnt(0) 
; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[16:17] -; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 6 -; NOOPT-NEXT: v_readlane_b32 s1, v0, 7 ; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v1, off, s[20:23], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v2, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload @@ -4646,6 +4643,9 @@ define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, p ; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:60 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:68 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v18, off, s[20:23], 0 offset:136 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(14) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 6 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 7 ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_readfirstlane_b32 s2, v18 ; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v18 @@ -4708,11 +4708,6 @@ define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, p ; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 ; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[16:17] -; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 0 -; NOOPT-NEXT: v_readlane_b32 s1, v0, 1 -; NOOPT-NEXT: v_readlane_b32 s2, v0, 2 -; NOOPT-NEXT: v_readlane_b32 s3, v0, 3 ; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:140 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:144 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v18, off, s[20:23], 0 offset:148 ; 4-byte Folded Reload @@ -4729,6 +4724,11 @@ define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, p ; NOOPT-NEXT: buffer_load_dword v29, off, s[20:23], 0 offset:192 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v30, 
off, s[20:23], 0 offset:196 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v31, off, s[20:23], 0 offset:200 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(14) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 0 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 1 +; NOOPT-NEXT: v_readlane_b32 s2, v0, 2 +; NOOPT-NEXT: v_readlane_b32 s3, v0, 3 ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_mov_b32_e32 v5, v19 ; NOOPT-NEXT: v_mov_b32_e32 v6, v18 @@ -4950,90 +4950,91 @@ entry: define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in) { ; GENERIC-LABEL: extract_vgpr_offset_multiple_in_block: ; GENERIC: ; %bb.0: ; %entry -; GENERIC-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GENERIC-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GENERIC-NEXT: s_mov_b32 s11, 0xf000 -; GENERIC-NEXT: s_mov_b32 s6, 0 +; GENERIC-NEXT: s_mov_b32 s2, 0 ; GENERIC-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GENERIC-NEXT: v_mov_b32_e32 v2, 0 -; GENERIC-NEXT: s_mov_b32 s7, s11 +; GENERIC-NEXT: s_mov_b32 s3, s11 ; GENERIC-NEXT: s_waitcnt lgkmcnt(0) -; GENERIC-NEXT: buffer_load_dword v1, v[1:2], s[4:7], 0 addr64 glc +; GENERIC-NEXT: buffer_load_dword v1, v[1:2], s[0:3], 0 addr64 glc ; GENERIC-NEXT: s_waitcnt vmcnt(0) -; GENERIC-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 ; GENERIC-NEXT: s_mov_b32 s10, -1 +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GENERIC-NEXT: ;;#ASMSTART ; GENERIC-NEXT: s_mov_b32 s4, 17 ; GENERIC-NEXT: ;;#ASMEND -; GENERIC-NEXT: v_add_i32_e32 v2, vcc, 1, v1 -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GENERIC-NEXT: v_cndmask_b32_e64 v3, 7, 9, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 2, v1 -; GENERIC-NEXT: v_cndmask_b32_e32 v3, 11, v3, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GENERIC-NEXT: v_cndmask_b32_e64 v4, 7, 9, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 3, v1 -; GENERIC-NEXT: v_cndmask_b32_e32 v3, 13, v3, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 2, v2 
-; GENERIC-NEXT: v_cndmask_b32_e32 v4, 11, v4, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 4, v1 -; GENERIC-NEXT: v_cndmask_b32_e32 v3, 5, v3, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 3, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v4, 13, v4, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 5, v1 -; GENERIC-NEXT: v_cndmask_b32_e32 v3, 6, v3, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 4, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 6, v1 -; GENERIC-NEXT: v_cndmask_b32_e32 v3, 7, v3, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 5, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v4, 6, v4, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 7, v1 -; GENERIC-NEXT: v_cndmask_b32_e32 v3, 8, v3, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 6, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v4, 7, v4, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 8, v1 -; GENERIC-NEXT: v_cndmask_b32_e32 v3, 9, v3, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 7, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v4, 8, v4, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 9, v1 -; GENERIC-NEXT: v_cndmask_b32_e32 v3, 10, v3, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 8, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v4, 9, v4, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 10, v1 -; GENERIC-NEXT: v_cndmask_b32_e32 v3, 11, v3, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 9, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v4, 10, v4, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 11, v1 -; GENERIC-NEXT: v_cndmask_b32_e32 v3, 12, v3, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 10, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v4, 11, v4, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 12, v1 -; GENERIC-NEXT: v_cndmask_b32_e32 v3, 13, v3, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 11, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v4, 12, v4, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 13, v1 -; GENERIC-NEXT: v_cndmask_b32_e32 v3, 14, v3, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 12, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v4, 13, v4, vcc -; GENERIC-NEXT: 
v_cmp_ne_u32_e32 vcc, 14, v1 -; GENERIC-NEXT: v_cndmask_b32_e32 v3, 15, v3, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 13, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v4, 14, v4, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 15, v1 -; GENERIC-NEXT: v_cndmask_b32_e32 v1, 16, v3, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 14, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v3, 15, v4, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 15, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v2, 16, v3, vcc -; GENERIC-NEXT: s_waitcnt lgkmcnt(0) +; GENERIC-NEXT: v_add_i32_e64 v0, s[0:1], 1, v1 +; GENERIC-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v1 +; GENERIC-NEXT: v_cndmask_b32_e64 v2, 7, 9, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 2, v1 +; GENERIC-NEXT: v_cndmask_b32_e64 v2, 11, v2, s[0:1] +; GENERIC-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0 +; GENERIC-NEXT: v_cndmask_b32_e64 v3, 7, 9, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 3, v1 +; GENERIC-NEXT: v_cndmask_b32_e64 v2, 13, v2, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 2, v0 +; GENERIC-NEXT: v_cndmask_b32_e64 v3, 11, v3, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 4, v1 +; GENERIC-NEXT: v_cndmask_b32_e64 v2, 5, v2, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 3, v0 +; GENERIC-NEXT: v_cndmask_b32_e64 v3, 13, v3, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 5, v1 +; GENERIC-NEXT: v_cndmask_b32_e64 v2, 6, v2, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 4, v0 +; GENERIC-NEXT: v_cndmask_b32_e64 v3, 5, v3, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 6, v1 +; GENERIC-NEXT: v_cndmask_b32_e64 v2, 7, v2, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 5, v0 +; GENERIC-NEXT: v_cndmask_b32_e64 v3, 6, v3, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 7, v1 +; GENERIC-NEXT: v_cndmask_b32_e64 v2, 8, v2, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 6, v0 +; GENERIC-NEXT: v_cndmask_b32_e64 v3, 7, v3, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 8, v1 +; GENERIC-NEXT: v_cndmask_b32_e64 v2, 9, v2, s[0:1] +; GENERIC-NEXT: 
v_cmp_ne_u32_e64 s[0:1], 7, v0 +; GENERIC-NEXT: v_cndmask_b32_e64 v3, 8, v3, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 9, v1 +; GENERIC-NEXT: v_cndmask_b32_e64 v2, 10, v2, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 8, v0 +; GENERIC-NEXT: v_cndmask_b32_e64 v3, 9, v3, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 10, v1 +; GENERIC-NEXT: v_cndmask_b32_e64 v2, 11, v2, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 9, v0 +; GENERIC-NEXT: v_cndmask_b32_e64 v3, 10, v3, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 11, v1 +; GENERIC-NEXT: v_cndmask_b32_e64 v2, 12, v2, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 10, v0 +; GENERIC-NEXT: v_cndmask_b32_e64 v3, 11, v3, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 12, v1 +; GENERIC-NEXT: v_cndmask_b32_e64 v2, 13, v2, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 11, v0 +; GENERIC-NEXT: v_cndmask_b32_e64 v3, 12, v3, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 13, v1 +; GENERIC-NEXT: v_cndmask_b32_e64 v2, 14, v2, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 12, v0 +; GENERIC-NEXT: v_cndmask_b32_e64 v3, 13, v3, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 14, v1 +; GENERIC-NEXT: v_cndmask_b32_e64 v2, 15, v2, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 13, v0 +; GENERIC-NEXT: v_cndmask_b32_e64 v3, 14, v3, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 15, v1 +; GENERIC-NEXT: v_cndmask_b32_e64 v1, 16, v2, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 14, v0 +; GENERIC-NEXT: v_cndmask_b32_e64 v2, 15, v3, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 15, v0 +; GENERIC-NEXT: v_cndmask_b32_e64 v0, 16, v2, s[0:1] ; GENERIC-NEXT: buffer_store_dword v1, off, s[8:11], 0 ; GENERIC-NEXT: s_waitcnt vmcnt(0) -; GENERIC-NEXT: buffer_store_dword v2, off, s[8:11], 0 +; GENERIC-NEXT: s_and_b64 s[0:1], vcc, -1 +; GENERIC-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; GENERIC-NEXT: s_waitcnt vmcnt(0) -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GENERIC-NEXT: s_and_saveexec_b64 s[0:1], vcc 
-; GENERIC-NEXT: s_cbranch_execz .LBB16_2 +; GENERIC-NEXT: s_cmov_b64 exec, vcc +; GENERIC-NEXT: s_cbranch_scc0 .LBB16_2 ; GENERIC-NEXT: ; %bb.1: ; %bb1 +; GENERIC-NEXT: s_waitcnt expcnt(0) ; GENERIC-NEXT: v_mov_b32_e32 v0, s4 ; GENERIC-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; GENERIC-NEXT: s_waitcnt vmcnt(0) @@ -5183,9 +5184,6 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1 ; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[28:29] -; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 23 -; NOOPT-NEXT: v_readlane_b32 s1, v0, 24 ; NOOPT-NEXT: buffer_load_dword v18, off, s[36:39], 0 offset:80 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:4 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v2, off, s[36:39], 0 offset:8 ; 4-byte Folded Reload @@ -5204,6 +5202,9 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1 ; NOOPT-NEXT: buffer_load_dword v15, off, s[36:39], 0 offset:60 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v16, off, s[36:39], 0 offset:64 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v17, off, s[36:39], 0 offset:72 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(14) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 23 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 24 ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_readfirstlane_b32 s2, v17 ; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v17 @@ -5301,9 +5302,6 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1 ; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[28:29] -; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 28 -; NOOPT-NEXT: v_readlane_b32 s1, v0, 29 ; NOOPT-NEXT: buffer_load_dword v18, off, s[36:39], 0 offset:152 ; 4-byte Folded 
Reload ; NOOPT-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:88 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v2, off, s[36:39], 0 offset:92 ; 4-byte Folded Reload @@ -5322,6 +5320,9 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1 ; NOOPT-NEXT: buffer_load_dword v15, off, s[36:39], 0 offset:144 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v16, off, s[36:39], 0 offset:148 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v17, off, s[36:39], 0 offset:68 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(14) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 28 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 29 ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_readfirstlane_b32 s2, v17 ; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v17 @@ -5351,59 +5352,59 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1 ; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 ; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[28:29] -; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:76 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v2, off, s[36:39], 0 offset:156 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v3, off, s[36:39], 0 offset:84 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(3) ; NOOPT-NEXT: v_readlane_b32 s0, v0, 4 ; NOOPT-NEXT: v_readlane_b32 s4, v0, 0 ; NOOPT-NEXT: v_readlane_b32 s5, v0, 1 ; NOOPT-NEXT: v_readlane_b32 s6, v0, 2 ; NOOPT-NEXT: v_readlane_b32 s7, v0, 3 -; NOOPT-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:76 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v2, off, s[36:39], 0 offset:156 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v3, off, s[36:39], 0 offset:84 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: buffer_store_dword v3, off, s[4:7], 0 ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: buffer_store_dword v2, off, 
s[4:7], 0 ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_cmp_eq_u32_e64 s[2:3], v1, s0 -; NOOPT-NEXT: s_mov_b64 s[0:1], exec -; NOOPT-NEXT: v_writelane_b32 v0, s0, 30 -; NOOPT-NEXT: v_writelane_b32 v0, s1, 31 +; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, s0 +; NOOPT-NEXT: s_mov_b64 s[2:3], exec +; NOOPT-NEXT: v_writelane_b32 v0, s2, 30 +; NOOPT-NEXT: v_writelane_b32 v0, s3, 31 ; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 ; NOOPT-NEXT: buffer_store_dword v0, off, s[36:39], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[28:29] -; NOOPT-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; NOOPT-NEXT: s_mov_b64 exec, s[0:1] -; NOOPT-NEXT: s_cbranch_execz .LBB16_8 -; NOOPT-NEXT: ; %bb.7: ; %bb1 +; NOOPT-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; NOOPT-NEXT: s_cmov_b64 exec, s[0:1] +; NOOPT-NEXT: s_cbranch_scc1 .LBB16_7 +; NOOPT-NEXT: s_branch .LBB16_8 +; NOOPT-NEXT: .LBB16_7: ; %bb1 ; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 ; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[28:29] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s4, v0, 25 -; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 -; NOOPT-NEXT: s_mov_b32 s7, s1 -; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 -; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 -; NOOPT-NEXT: s_mov_b32 s5, 0xf000 -; NOOPT-NEXT: s_mov_b32 s6, -1 -; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 -; NOOPT-NEXT: s_mov_b32 s1, s7 -; NOOPT-NEXT: s_mov_b32 s2, s6 -; NOOPT-NEXT: s_mov_b32 s3, s5 -; NOOPT-NEXT: v_mov_b32_e32 v0, s4 -; NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; NOOPT-NEXT: v_readlane_b32 s0, v0, 30 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 31 +; NOOPT-NEXT: v_readlane_b32 s2, v0, 25 +; NOOPT-NEXT: ; implicit-def: $sgpr4_sgpr5 +; NOOPT-NEXT: s_mov_b32 s9, s5 +; NOOPT-NEXT: ; implicit-def: $sgpr4_sgpr5 +; NOOPT-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; NOOPT-NEXT: 
s_mov_b32 s3, 0xf000 +; NOOPT-NEXT: s_mov_b32 s8, -1 +; NOOPT-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; NOOPT-NEXT: s_mov_b32 s5, s9 +; NOOPT-NEXT: s_mov_b32 s6, s8 +; NOOPT-NEXT: s_mov_b32 s7, s3 +; NOOPT-NEXT: v_mov_b32_e32 v0, s2 +; NOOPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: s_or_b64 exec, exec, s[0:1] ; NOOPT-NEXT: .LBB16_8: ; %bb2 ; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 ; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[28:29] -; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 30 -; NOOPT-NEXT: v_readlane_b32 s1, v0, 31 -; NOOPT-NEXT: s_or_b64 exec, exec, s[0:1] ; NOOPT-NEXT: ; kill: killed $vgpr0 ; NOOPT-NEXT: s_endpgm ; @@ -5420,6 +5421,7 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1 ; SI-MOVREL-NEXT: buffer_load_dword v1, v[1:2], s[4:7], 0 addr64 glc ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) ; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; SI-MOVREL-NEXT: s_and_b64 s[0:1], vcc, -1 ; SI-MOVREL-NEXT: s_mov_b32 s10, -1 ; SI-MOVREL-NEXT: ;;#ASMSTART ; SI-MOVREL-NEXT: s_mov_b32 s4, 17 @@ -5489,8 +5491,8 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1 ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) ; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) -; SI-MOVREL-NEXT: s_and_saveexec_b64 s[0:1], vcc -; SI-MOVREL-NEXT: s_cbranch_execz .LBB16_2 +; SI-MOVREL-NEXT: s_cmov_b64 exec, vcc +; SI-MOVREL-NEXT: s_cbranch_scc0 .LBB16_2 ; SI-MOVREL-NEXT: ; %bb.1: ; %bb1 ; SI-MOVREL-NEXT: s_waitcnt expcnt(0) ; SI-MOVREL-NEXT: v_mov_b32_e32 v0, s4 @@ -5511,6 +5513,7 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: 
s_cmp_lg_u64 vcc, 0 ; VI-NEXT: ;;#ASMSTART ; VI-NEXT: s_mov_b32 s4, 17 ; VI-NEXT: ;;#ASMEND @@ -5582,8 +5585,8 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_and_saveexec_b64 s[0:1], vcc -; VI-NEXT: s_cbranch_execz .LBB16_2 +; VI-NEXT: s_cmov_b64 exec, vcc +; VI-NEXT: s_cbranch_scc0 .LBB16_2 ; VI-NEXT: ; %bb.1: ; %bb1 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: flat_store_dword v[0:1], v0 @@ -5597,6 +5600,7 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1 ; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x24 ; GFX9-IDXMODE-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-IDXMODE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-IDXMODE-NEXT: global_load_dword v2, v1, s[0:1] glc ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) @@ -5669,8 +5673,8 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1 ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) ; GFX9-IDXMODE-NEXT: global_store_dword v1, v0, s[6:7] ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) -; GFX9-IDXMODE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-IDXMODE-NEXT: s_cbranch_execz .LBB16_2 +; GFX9-IDXMODE-NEXT: s_cmov_b64 exec, vcc +; GFX9-IDXMODE-NEXT: s_cbranch_scc0 .LBB16_2 ; GFX9-IDXMODE-NEXT: ; %bb.1: ; %bb1 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-IDXMODE-NEXT: global_store_dword v[0:1], v0, off @@ -5702,116 +5706,116 @@ bb2: define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in, <16 x i32> %vec0) { ; GENERIC-LABEL: insert_vgpr_offset_multiple_in_block: ; GENERIC: ; %bb.0: ; %entry -; GENERIC-NEXT: s_load_dwordx2 s[24:25], s[2:3], 0xd -; GENERIC-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 -; GENERIC-NEXT: s_mov_b32 s23, 0xf000 -; GENERIC-NEXT: s_mov_b32 s26, 0 +; 
GENERIC-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x9 +; GENERIC-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0xd +; GENERIC-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x19 +; GENERIC-NEXT: s_mov_b32 s19, 0xf000 +; GENERIC-NEXT: s_mov_b32 s18, -1 +; GENERIC-NEXT: s_mov_b32 s22, 0 ; GENERIC-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GENERIC-NEXT: v_mov_b32_e32 v2, 0 -; GENERIC-NEXT: s_mov_b32 s27, s23 +; GENERIC-NEXT: s_mov_b32 s23, s19 ; GENERIC-NEXT: s_waitcnt lgkmcnt(0) -; GENERIC-NEXT: buffer_load_dword v2, v[1:2], s[24:27], 0 addr64 glc +; GENERIC-NEXT: buffer_load_dword v1, v[1:2], s[20:23], 0 addr64 glc ; GENERIC-NEXT: s_waitcnt vmcnt(0) -; GENERIC-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x9 -; GENERIC-NEXT: s_mov_b32 s22, -1 +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GENERIC-NEXT: s_and_b64 s[20:21], vcc, -1 ; GENERIC-NEXT: ;;#ASMSTART -; GENERIC-NEXT: v_mov_b32 v1, 62 +; GENERIC-NEXT: v_mov_b32 v0, 62 ; GENERIC-NEXT: ;;#ASMEND -; GENERIC-NEXT: v_mov_b32_e32 v3, s16 -; GENERIC-NEXT: v_mov_b32_e32 v4, s17 -; GENERIC-NEXT: v_mov_b32_e32 v5, s18 -; GENERIC-NEXT: v_mov_b32_e32 v6, s19 -; GENERIC-NEXT: v_mov_b32_e32 v7, s12 -; GENERIC-NEXT: v_mov_b32_e32 v8, s13 -; GENERIC-NEXT: v_mov_b32_e32 v9, s14 -; GENERIC-NEXT: v_mov_b32_e32 v10, s15 -; GENERIC-NEXT: v_mov_b32_e32 v11, s8 -; GENERIC-NEXT: v_mov_b32_e32 v12, s9 -; GENERIC-NEXT: v_mov_b32_e32 v13, s10 -; GENERIC-NEXT: v_mov_b32_e32 v14, s11 -; GENERIC-NEXT: v_mov_b32_e32 v15, s4 -; GENERIC-NEXT: v_mov_b32_e32 v16, s5 -; GENERIC-NEXT: v_mov_b32_e32 v17, s6 -; GENERIC-NEXT: v_mov_b32_e32 v18, s7 -; GENERIC-NEXT: v_add_i32_e32 v19, vcc, 1, v2 -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 12, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v20, v3, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 13, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v21, v4, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 14, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v22, v5, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 15, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v23, v6, v1, vcc 
-; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 8, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v24, v7, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 9, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v25, v8, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 10, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v26, v9, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 11, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v10, v10, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v6, v11, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v7, v12, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v8, v13, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v9, v14, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v11, v15, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v3, v16, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v4, v17, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v2, v18, v1, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 3, v19 -; GENERIC-NEXT: v_cndmask_b32_e32 v5, 63, v2, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 2, v19 -; GENERIC-NEXT: v_cndmask_b32_e32 v4, 63, v4, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 1, v19 -; GENERIC-NEXT: v_cndmask_b32_e32 v3, 63, v3, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; GENERIC-NEXT: v_cndmask_b32_e32 v2, 63, v11, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 7, v19 -; GENERIC-NEXT: v_cndmask_b32_e32 v9, 63, v9, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 6, v19 -; GENERIC-NEXT: v_cndmask_b32_e32 v8, 63, v8, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 5, v19 -; GENERIC-NEXT: v_cndmask_b32_e32 v7, 63, v7, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 4, v19 -; GENERIC-NEXT: v_cndmask_b32_e32 v6, 63, v6, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 11, v19 -; 
GENERIC-NEXT: v_cndmask_b32_e32 v13, 63, v10, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 10, v19 -; GENERIC-NEXT: v_cndmask_b32_e32 v12, 63, v26, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 9, v19 -; GENERIC-NEXT: v_cndmask_b32_e32 v11, 63, v25, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 8, v19 -; GENERIC-NEXT: v_cndmask_b32_e32 v10, 63, v24, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 15, v19 -; GENERIC-NEXT: v_cndmask_b32_e32 v17, 63, v23, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 14, v19 -; GENERIC-NEXT: v_cndmask_b32_e32 v16, 63, v22, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 13, v19 -; GENERIC-NEXT: v_cndmask_b32_e32 v15, 63, v21, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 12, v19 -; GENERIC-NEXT: v_cndmask_b32_e32 v14, 63, v20, vcc -; GENERIC-NEXT: s_waitcnt lgkmcnt(0) -; GENERIC-NEXT: buffer_store_dwordx4 v[14:17], off, s[20:23], 0 offset:48 +; GENERIC-NEXT: v_mov_b32_e32 v2, s12 +; GENERIC-NEXT: v_mov_b32_e32 v3, s13 +; GENERIC-NEXT: v_mov_b32_e32 v4, s14 +; GENERIC-NEXT: v_mov_b32_e32 v5, s15 +; GENERIC-NEXT: v_mov_b32_e32 v6, s8 +; GENERIC-NEXT: v_mov_b32_e32 v7, s9 +; GENERIC-NEXT: v_mov_b32_e32 v8, s10 +; GENERIC-NEXT: v_mov_b32_e32 v9, s11 +; GENERIC-NEXT: v_mov_b32_e32 v10, s4 +; GENERIC-NEXT: v_mov_b32_e32 v11, s5 +; GENERIC-NEXT: v_mov_b32_e32 v12, s6 +; GENERIC-NEXT: v_mov_b32_e32 v13, s7 +; GENERIC-NEXT: v_mov_b32_e32 v14, s0 +; GENERIC-NEXT: v_mov_b32_e32 v15, s1 +; GENERIC-NEXT: v_mov_b32_e32 v16, s2 +; GENERIC-NEXT: v_mov_b32_e32 v17, s3 +; GENERIC-NEXT: v_add_i32_e64 v18, s[0:1], 1, v1 +; GENERIC-NEXT: v_cmp_eq_u32_e64 s[0:1], 12, v1 +; GENERIC-NEXT: v_cndmask_b32_e64 v19, v2, v0, s[0:1] +; GENERIC-NEXT: v_cmp_eq_u32_e64 s[0:1], 13, v1 +; GENERIC-NEXT: v_cndmask_b32_e64 v20, v3, v0, s[0:1] +; GENERIC-NEXT: v_cmp_eq_u32_e64 s[0:1], 14, v1 +; GENERIC-NEXT: v_cndmask_b32_e64 v21, v4, v0, s[0:1] +; GENERIC-NEXT: v_cmp_eq_u32_e64 s[0:1], 15, v1 +; GENERIC-NEXT: v_cndmask_b32_e64 v22, v5, v0, s[0:1] +; GENERIC-NEXT: v_cmp_eq_u32_e64 s[0:1], 8, v1 +; 
GENERIC-NEXT: v_cndmask_b32_e64 v23, v6, v0, s[0:1] +; GENERIC-NEXT: v_cmp_eq_u32_e64 s[0:1], 9, v1 +; GENERIC-NEXT: v_cndmask_b32_e64 v24, v7, v0, s[0:1] +; GENERIC-NEXT: v_cmp_eq_u32_e64 s[0:1], 10, v1 +; GENERIC-NEXT: v_cndmask_b32_e64 v25, v8, v0, s[0:1] +; GENERIC-NEXT: v_cmp_eq_u32_e64 s[0:1], 11, v1 +; GENERIC-NEXT: v_cndmask_b32_e64 v9, v9, v0, s[0:1] +; GENERIC-NEXT: v_cmp_eq_u32_e64 s[0:1], 4, v1 +; GENERIC-NEXT: v_cndmask_b32_e64 v5, v10, v0, s[0:1] +; GENERIC-NEXT: v_cmp_eq_u32_e64 s[0:1], 5, v1 +; GENERIC-NEXT: v_cndmask_b32_e64 v6, v11, v0, s[0:1] +; GENERIC-NEXT: v_cmp_eq_u32_e64 s[0:1], 6, v1 +; GENERIC-NEXT: v_cndmask_b32_e64 v7, v12, v0, s[0:1] +; GENERIC-NEXT: v_cmp_eq_u32_e64 s[0:1], 7, v1 +; GENERIC-NEXT: v_cndmask_b32_e64 v8, v13, v0, s[0:1] +; GENERIC-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 +; GENERIC-NEXT: v_cndmask_b32_e64 v10, v14, v0, s[0:1] +; GENERIC-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v1 +; GENERIC-NEXT: v_cndmask_b32_e64 v2, v15, v0, s[0:1] +; GENERIC-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1 +; GENERIC-NEXT: v_cndmask_b32_e64 v3, v16, v0, s[0:1] +; GENERIC-NEXT: v_cmp_eq_u32_e64 s[0:1], 3, v1 +; GENERIC-NEXT: v_cndmask_b32_e64 v1, v17, v0, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 3, v18 +; GENERIC-NEXT: v_cndmask_b32_e64 v4, 63, v1, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 2, v18 +; GENERIC-NEXT: v_cndmask_b32_e64 v3, 63, v3, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v18 +; GENERIC-NEXT: v_cndmask_b32_e64 v2, 63, v2, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v18 +; GENERIC-NEXT: v_cndmask_b32_e64 v1, 63, v10, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 7, v18 +; GENERIC-NEXT: v_cndmask_b32_e64 v8, 63, v8, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 6, v18 +; GENERIC-NEXT: v_cndmask_b32_e64 v7, 63, v7, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 5, v18 +; GENERIC-NEXT: v_cndmask_b32_e64 v6, 63, v6, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 4, v18 +; GENERIC-NEXT: v_cndmask_b32_e64 v5, 
63, v5, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 11, v18 +; GENERIC-NEXT: v_cndmask_b32_e64 v12, 63, v9, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 10, v18 +; GENERIC-NEXT: v_cndmask_b32_e64 v11, 63, v25, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 9, v18 +; GENERIC-NEXT: v_cndmask_b32_e64 v10, 63, v24, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 8, v18 +; GENERIC-NEXT: v_cndmask_b32_e64 v9, 63, v23, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 15, v18 +; GENERIC-NEXT: v_cndmask_b32_e64 v16, 63, v22, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 14, v18 +; GENERIC-NEXT: v_cndmask_b32_e64 v15, 63, v21, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 13, v18 +; GENERIC-NEXT: v_cndmask_b32_e64 v14, 63, v20, s[0:1] +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 12, v18 +; GENERIC-NEXT: v_cndmask_b32_e64 v13, 63, v19, s[0:1] +; GENERIC-NEXT: buffer_store_dwordx4 v[13:16], off, s[16:19], 0 offset:48 ; GENERIC-NEXT: s_waitcnt vmcnt(0) -; GENERIC-NEXT: buffer_store_dwordx4 v[10:13], off, s[20:23], 0 offset:32 +; GENERIC-NEXT: buffer_store_dwordx4 v[9:12], off, s[16:19], 0 offset:32 ; GENERIC-NEXT: s_waitcnt vmcnt(0) -; GENERIC-NEXT: buffer_store_dwordx4 v[6:9], off, s[20:23], 0 offset:16 +; GENERIC-NEXT: buffer_store_dwordx4 v[5:8], off, s[16:19], 0 offset:16 ; GENERIC-NEXT: s_waitcnt vmcnt(0) -; GENERIC-NEXT: buffer_store_dwordx4 v[2:5], off, s[20:23], 0 +; GENERIC-NEXT: buffer_store_dwordx4 v[1:4], off, s[16:19], 0 ; GENERIC-NEXT: s_waitcnt vmcnt(0) -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GENERIC-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GENERIC-NEXT: s_cbranch_execz .LBB17_2 +; GENERIC-NEXT: s_cmov_b64 exec, vcc +; GENERIC-NEXT: s_cbranch_scc0 .LBB17_2 ; GENERIC-NEXT: ; %bb.1: ; %bb1 -; GENERIC-NEXT: buffer_store_dword v1, off, s[20:23], 0 +; GENERIC-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; GENERIC-NEXT: s_waitcnt vmcnt(0) ; GENERIC-NEXT: .LBB17_2: ; %bb2 ; GENERIC-NEXT: s_endpgm @@ -5915,9 +5919,6 @@ define amdgpu_kernel void 
@insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:64 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[26:27] -; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 7 -; NOOPT-NEXT: v_readlane_b32 s1, v0, 8 ; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v1, off, s[28:31], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v2, off, s[28:31], 0 offset:4 ; 4-byte Folded Reload @@ -5937,6 +5938,9 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: buffer_load_dword v16, off, s[28:31], 0 offset:60 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v17, off, s[28:31], 0 offset:72 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v18, off, s[28:31], 0 offset:80 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(14) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 7 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 8 ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_readfirstlane_b32 s2, v18 ; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v18 @@ -6047,9 +6051,6 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:64 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[26:27] -; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 11 -; NOOPT-NEXT: v_readlane_b32 s1, v0, 12 ; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v1, off, s[28:31], 0 offset:152 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v2, off, s[28:31], 0 offset:156 ; 4-byte Folded Reload @@ -6069,6 +6070,9 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: buffer_load_dword v16, off, s[28:31], 0 offset:212 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v17, off, s[28:31], 0 offset:216 ; 4-byte Folded Reload ; 
NOOPT-NEXT: buffer_load_dword v18, off, s[28:31], 0 offset:76 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(14) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 11 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 12 ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_readfirstlane_b32 s2, v18 ; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v18 @@ -6131,12 +6135,6 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 ; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:64 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[26:27] -; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 4 -; NOOPT-NEXT: v_readlane_b32 s4, v0, 0 -; NOOPT-NEXT: v_readlane_b32 s5, v0, 1 -; NOOPT-NEXT: v_readlane_b32 s6, v0, 2 -; NOOPT-NEXT: v_readlane_b32 s7, v0, 3 ; NOOPT-NEXT: buffer_load_dword v1, off, s[28:31], 0 offset:84 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v17, off, s[28:31], 0 offset:220 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v18, off, s[28:31], 0 offset:224 ; 4-byte Folded Reload @@ -6154,6 +6152,12 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: buffer_load_dword v30, off, s[28:31], 0 offset:272 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v31, off, s[28:31], 0 offset:276 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v32, off, s[28:31], 0 offset:280 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(14) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 4 +; NOOPT-NEXT: v_readlane_b32 s4, v0, 0 +; NOOPT-NEXT: v_readlane_b32 s5, v0, 1 +; NOOPT-NEXT: v_readlane_b32 s6, v0, 2 +; NOOPT-NEXT: v_readlane_b32 s7, v0, 3 ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_mov_b32_e32 v6, v20 ; NOOPT-NEXT: v_mov_b32_e32 v7, v19 @@ -6212,41 +6216,44 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: v_mov_b32_e32 v5, v6 ; NOOPT-NEXT: buffer_store_dwordx4 v[2:5], off, s[4:7], 0 
; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_cmp_eq_u32_e64 s[2:3], v1, s0 -; NOOPT-NEXT: s_mov_b64 s[0:1], exec -; NOOPT-NEXT: v_writelane_b32 v0, s0, 13 -; NOOPT-NEXT: v_writelane_b32 v0, s1, 14 +; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, s0 +; NOOPT-NEXT: s_mov_b64 s[2:3], exec +; NOOPT-NEXT: v_writelane_b32 v0, s2, 13 +; NOOPT-NEXT: v_writelane_b32 v0, s3, 14 ; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 ; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:64 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[26:27] -; NOOPT-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; NOOPT-NEXT: s_mov_b64 exec, s[0:1] -; NOOPT-NEXT: s_cbranch_execz .LBB17_8 -; NOOPT-NEXT: ; %bb.7: ; %bb1 +; NOOPT-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; NOOPT-NEXT: s_cmov_b64 exec, s[0:1] +; NOOPT-NEXT: s_cbranch_scc1 .LBB17_7 +; NOOPT-NEXT: s_branch .LBB17_8 +; NOOPT-NEXT: .LBB17_7: ; %bb1 ; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:68 ; 4-byte Folded Reload -; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 -; NOOPT-NEXT: s_mov_b32 s6, s1 -; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 -; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 -; NOOPT-NEXT: s_mov_b32 s4, 0xf000 -; NOOPT-NEXT: s_mov_b32 s5, -1 -; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 -; NOOPT-NEXT: s_mov_b32 s1, s6 -; NOOPT-NEXT: s_mov_b32 s2, s5 -; NOOPT-NEXT: s_mov_b32 s3, s4 +; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 +; NOOPT-NEXT: buffer_load_dword v1, off, s[28:31], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[26:27] ; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; NOOPT-NEXT: v_readlane_b32 s0, v1, 13 +; NOOPT-NEXT: v_readlane_b32 s1, v1, 14 +; NOOPT-NEXT: ; implicit-def: $sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s8, s3 +; NOOPT-NEXT: ; implicit-def: $sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s4, s2 +; NOOPT-NEXT: s_mov_b32 s2, 0xf000 +; NOOPT-NEXT: s_mov_b32 s3, -1 +; 
NOOPT-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; NOOPT-NEXT: s_mov_b32 s5, s8 +; NOOPT-NEXT: s_mov_b32 s6, s3 +; NOOPT-NEXT: s_mov_b32 s7, s2 +; NOOPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: s_or_b64 exec, exec, s[0:1] ; NOOPT-NEXT: .LBB17_8: ; %bb2 ; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 ; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:64 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[26:27] -; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 13 -; NOOPT-NEXT: v_readlane_b32 s1, v0, 14 -; NOOPT-NEXT: s_or_b64 exec, exec, s[0:1] ; NOOPT-NEXT: ; kill: killed $vgpr0 ; NOOPT-NEXT: s_endpgm ; @@ -6350,6 +6357,7 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 12, v19 ; SI-MOVREL-NEXT: v_cndmask_b32_e32 v14, 63, v20, vcc ; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; SI-MOVREL-NEXT: s_and_b64 s[0:1], vcc, -1 ; SI-MOVREL-NEXT: buffer_store_dwordx4 v[14:17], off, s[20:23], 0 offset:48 ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) ; SI-MOVREL-NEXT: buffer_store_dwordx4 v[10:13], off, s[20:23], 0 offset:32 @@ -6358,8 +6366,8 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) ; SI-MOVREL-NEXT: buffer_store_dwordx4 v[2:5], off, s[20:23], 0 ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) -; SI-MOVREL-NEXT: s_and_saveexec_b64 s[0:1], vcc -; SI-MOVREL-NEXT: s_cbranch_execz .LBB17_2 +; SI-MOVREL-NEXT: s_cmov_b64 exec, vcc +; SI-MOVREL-NEXT: s_cbranch_scc0 .LBB17_2 ; SI-MOVREL-NEXT: ; %bb.1: ; %bb1 ; SI-MOVREL-NEXT: buffer_store_dword v1, off, s[20:23], 0 ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) @@ -6478,6 +6486,7 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[14:15], v[10:13] ; VI-NEXT: s_waitcnt vmcnt(0) 
+; VI-NEXT: s_cmp_lg_u64 vcc, 0 ; VI-NEXT: v_mov_b32_e32 v11, s3 ; VI-NEXT: v_mov_b32_e32 v10, s2 ; VI-NEXT: flat_store_dwordx4 v[10:11], v[6:9] @@ -6486,8 +6495,8 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; VI-NEXT: v_mov_b32_e32 v6, s0 ; VI-NEXT: flat_store_dwordx4 v[6:7], v[2:5] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_and_saveexec_b64 s[0:1], vcc -; VI-NEXT: s_cbranch_execz .LBB17_2 +; VI-NEXT: s_cmov_b64 exec, vcc +; VI-NEXT: s_cbranch_scc0 .LBB17_2 ; VI-NEXT: ; %bb.1: ; %bb1 ; VI-NEXT: flat_store_dword v[0:1], v1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -6590,6 +6599,7 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 12, v20 ; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v15, 63, v21, vcc ; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-IDXMODE-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-IDXMODE-NEXT: global_store_dwordx4 v2, v[15:18], s[0:1] offset:48 ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) ; GFX9-IDXMODE-NEXT: global_store_dwordx4 v2, v[11:14], s[0:1] offset:32 @@ -6598,8 +6608,8 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) ; GFX9-IDXMODE-NEXT: global_store_dwordx4 v2, v[3:6], s[0:1] ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) -; GFX9-IDXMODE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-IDXMODE-NEXT: s_cbranch_execz .LBB17_2 +; GFX9-IDXMODE-NEXT: s_cmov_b64 exec, vcc +; GFX9-IDXMODE-NEXT: s_cbranch_scc0 .LBB17_2 ; GFX9-IDXMODE-NEXT: ; %bb.1: ; %bb1 ; GFX9-IDXMODE-NEXT: global_store_dword v[0:1], v1, off ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) @@ -9094,9 +9104,9 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) { ; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[20:21] -; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s2, v0, 0 ; NOOPT-NEXT: 
buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(1) +; NOOPT-NEXT: v_readlane_b32 s2, v0, 0 ; NOOPT-NEXT: s_mov_b64 s[0:1], -1 ; NOOPT-NEXT: ; implicit-def: $sgpr4 ; NOOPT-NEXT: s_waitcnt vmcnt(0) @@ -9181,9 +9191,6 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) { ; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[20:21] -; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 6 -; NOOPT-NEXT: v_readlane_b32 s1, v0, 7 ; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v1, off, s[24:27], 0 offset:12 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v2, off, s[24:27], 0 offset:16 ; 4-byte Folded Reload @@ -9203,6 +9210,9 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) { ; NOOPT-NEXT: buffer_load_dword v16, off, s[24:27], 0 offset:72 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v17, off, s[24:27], 0 offset:76 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v18, off, s[24:27], 0 offset:80 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(14) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 6 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 7 ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_readfirstlane_b32 s2, v18 ; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v18 @@ -9291,21 +9301,19 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) { ; NOOPT-NEXT: s_mov_b64 exec, s[20:21] ; NOOPT-NEXT: .LBB26_6: ; %Flow ; NOOPT-NEXT: ; in Loop: Header=BB26_1 Depth=1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:8 ; 4-byte Folded Reload ; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 -; NOOPT-NEXT: s_waitcnt expcnt(1) ; NOOPT-NEXT: buffer_load_dword v1, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[20:21] ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_readlane_b32 s0, v1, 2 ; 
NOOPT-NEXT: v_readlane_b32 s1, v1, 3 -; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:8 ; 4-byte Folded Reload ; NOOPT-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; NOOPT-NEXT: s_mov_b32 s0, 1 ; NOOPT-NEXT: ; implicit-def: $sgpr1 ; NOOPT-NEXT: v_cmp_ne_u32_e64 s[0:1], v1, s0 ; NOOPT-NEXT: s_and_b64 vcc, exec, s[0:1] -; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_cbranch_vccnz .LBB26_1 ; NOOPT-NEXT: ; %bb.7: ; %bb8 @@ -9678,9 +9686,6 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace ; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:64 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[12:13] -; NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 9 -; NOOPT-NEXT: v_readlane_b32 s1, v0, 10 ; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload @@ -9700,6 +9705,9 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace ; NOOPT-NEXT: buffer_load_dword v16, off, s[16:19], 0 offset:60 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v17, off, s[16:19], 0 offset:144 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v18, off, s[16:19], 0 offset:132 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(14) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 9 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 10 ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_readfirstlane_b32 s2, v18 ; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v18 @@ -9762,11 +9770,6 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace ; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1 ; NOOPT-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:64 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[12:13] -; 
NOOPT-NEXT: s_waitcnt vmcnt(0) -; NOOPT-NEXT: v_readlane_b32 s0, v0, 3 -; NOOPT-NEXT: v_readlane_b32 s1, v0, 4 -; NOOPT-NEXT: v_readlane_b32 s2, v0, 5 -; NOOPT-NEXT: v_readlane_b32 s3, v0, 6 ; NOOPT-NEXT: buffer_load_dword v5, off, s[16:19], 0 offset:136 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v6, off, s[16:19], 0 offset:140 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v18, off, s[16:19], 0 offset:148 ; 4-byte Folded Reload @@ -9785,6 +9788,11 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace ; NOOPT-NEXT: buffer_load_dword v31, off, s[16:19], 0 offset:200 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v32, off, s[16:19], 0 offset:204 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v33, off, s[16:19], 0 offset:208 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(14) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 3 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 4 +; NOOPT-NEXT: v_readlane_b32 s2, v0, 5 +; NOOPT-NEXT: v_readlane_b32 s3, v0, 6 ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_mov_b32_e32 v7, v21 ; NOOPT-NEXT: v_mov_b32_e32 v8, v20 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll index da8aa544698355..72afe23b32bd39 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -171,10 +171,10 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) { ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_xor_b64 exec, exec, s[48:49] -; GCN-NEXT: s_cbranch_execnz .LBB2_1 +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[48:49] +; GCN-NEXT: s_cselect_b64 exec, s[4:5], s[46:47] +; GCN-NEXT: s_cbranch_scc1 .LBB2_1 ; GCN-NEXT: ; %bb.2: -; GCN-NEXT: s_mov_b64 exec, s[46:47] ; GCN-NEXT: v_readlane_b32 s49, v40, 17 ; GCN-NEXT: v_readlane_b32 s48, v40, 16 ; GCN-NEXT: v_readlane_b32 s47, v40, 15 @@ -210,7 +210,7 @@ define void 
@test_indirect_call_vgpr_ptr(ptr %fptr) { ; GISEL-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[18:19] -; GISEL-NEXT: v_writelane_b32 v40, s16, 18 +; GISEL-NEXT: v_writelane_b32 v40, s16, 16 ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 @@ -228,8 +228,6 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) { ; GISEL-NEXT: v_writelane_b32 v40, s45, 13 ; GISEL-NEXT: v_writelane_b32 v40, s46, 14 ; GISEL-NEXT: v_writelane_b32 v40, s47, 15 -; GISEL-NEXT: v_writelane_b32 v40, s48, 16 -; GISEL-NEXT: v_writelane_b32 v40, s49, 17 ; GISEL-NEXT: s_mov_b32 s42, s15 ; GISEL-NEXT: s_mov_b32 s43, s14 ; GISEL-NEXT: s_mov_b32 s44, s13 @@ -238,12 +236,11 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) { ; GISEL-NEXT: s_mov_b64 s[36:37], s[8:9] ; GISEL-NEXT: s_mov_b64 s[38:39], s[6:7] ; GISEL-NEXT: s_mov_b64 s[40:41], s[4:5] -; GISEL-NEXT: s_mov_b64 s[46:47], exec ; GISEL-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_readfirstlane_b32 s16, v0 ; GISEL-NEXT: v_readfirstlane_b32 s17, v1 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] -; GISEL-NEXT: s_and_saveexec_b64 s[48:49], vcc +; GISEL-NEXT: s_and_saveexec_b64 s[46:47], vcc ; GISEL-NEXT: s_mov_b64 s[4:5], s[40:41] ; GISEL-NEXT: s_mov_b64 s[6:7], s[38:39] ; GISEL-NEXT: s_mov_b64 s[8:9], s[36:37] @@ -255,12 +252,10 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) { ; GISEL-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GISEL-NEXT: ; implicit-def: $vgpr0 ; GISEL-NEXT: ; implicit-def: $vgpr31 -; GISEL-NEXT: s_xor_b64 exec, exec, s[48:49] -; GISEL-NEXT: s_cbranch_execnz .LBB2_1 +; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[46:47] +; GISEL-NEXT: s_cselect_b64 exec, s[4:5], s[46:47] +; GISEL-NEXT: s_cbranch_scc1 .LBB2_1 ; GISEL-NEXT: ; %bb.2: -; GISEL-NEXT: s_mov_b64 exec, s[46:47] -; GISEL-NEXT: v_readlane_b32 s49, v40, 17 -; GISEL-NEXT: 
v_readlane_b32 s48, v40, 16 ; GISEL-NEXT: v_readlane_b32 s47, v40, 15 ; GISEL-NEXT: v_readlane_b32 s46, v40, 14 ; GISEL-NEXT: v_readlane_b32 s45, v40, 13 @@ -277,7 +272,7 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) { ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 -; GISEL-NEXT: v_readlane_b32 s4, v40, 18 +; GISEL-NEXT: v_readlane_b32 s4, v40, 16 ; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[6:7] @@ -346,10 +341,10 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) { ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN-NEXT: ; implicit-def: $vgpr31 ; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: s_xor_b64 exec, exec, s[48:49] -; GCN-NEXT: s_cbranch_execnz .LBB3_1 +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[48:49] +; GCN-NEXT: s_cselect_b64 exec, s[4:5], s[46:47] +; GCN-NEXT: s_cbranch_scc1 .LBB3_1 ; GCN-NEXT: ; %bb.2: -; GCN-NEXT: s_mov_b64 exec, s[46:47] ; GCN-NEXT: v_readlane_b32 s49, v40, 17 ; GCN-NEXT: v_readlane_b32 s48, v40, 16 ; GCN-NEXT: v_readlane_b32 s47, v40, 15 @@ -385,7 +380,7 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) { ; GISEL-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[18:19] -; GISEL-NEXT: v_writelane_b32 v40, s16, 18 +; GISEL-NEXT: v_writelane_b32 v40, s16, 16 ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 @@ -403,8 +398,6 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) { ; GISEL-NEXT: v_writelane_b32 v40, s45, 13 ; GISEL-NEXT: v_writelane_b32 v40, s46, 14 ; GISEL-NEXT: v_writelane_b32 v40, s47, 15 -; GISEL-NEXT: v_writelane_b32 v40, s48, 16 -; GISEL-NEXT: v_writelane_b32 v40, s49, 17 ; GISEL-NEXT: s_mov_b32 s42, s15 ; GISEL-NEXT: s_mov_b32 s43, s14 ; GISEL-NEXT: 
s_mov_b32 s44, s13 @@ -413,12 +406,11 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) { ; GISEL-NEXT: s_mov_b64 s[36:37], s[8:9] ; GISEL-NEXT: s_mov_b64 s[38:39], s[6:7] ; GISEL-NEXT: s_mov_b64 s[40:41], s[4:5] -; GISEL-NEXT: s_mov_b64 s[46:47], exec ; GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_readfirstlane_b32 s16, v0 ; GISEL-NEXT: v_readfirstlane_b32 s17, v1 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] -; GISEL-NEXT: s_and_saveexec_b64 s[48:49], vcc +; GISEL-NEXT: s_and_saveexec_b64 s[46:47], vcc ; GISEL-NEXT: v_mov_b32_e32 v0, 0x7b ; GISEL-NEXT: s_mov_b64 s[4:5], s[40:41] ; GISEL-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -431,12 +423,10 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) { ; GISEL-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GISEL-NEXT: ; implicit-def: $vgpr0 ; GISEL-NEXT: ; implicit-def: $vgpr31 -; GISEL-NEXT: s_xor_b64 exec, exec, s[48:49] -; GISEL-NEXT: s_cbranch_execnz .LBB3_1 +; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[46:47] +; GISEL-NEXT: s_cselect_b64 exec, s[4:5], s[46:47] +; GISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GISEL-NEXT: ; %bb.2: -; GISEL-NEXT: s_mov_b64 exec, s[46:47] -; GISEL-NEXT: v_readlane_b32 s49, v40, 17 -; GISEL-NEXT: v_readlane_b32 s48, v40, 16 ; GISEL-NEXT: v_readlane_b32 s47, v40, 15 ; GISEL-NEXT: v_readlane_b32 s46, v40, 14 ; GISEL-NEXT: v_readlane_b32 s45, v40, 13 @@ -453,7 +443,7 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) { ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 -; GISEL-NEXT: v_readlane_b32 s4, v40, 18 +; GISEL-NEXT: v_readlane_b32 s4, v40, 16 ; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[6:7] @@ -520,10 +510,10 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) { ; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN-NEXT: ; implicit-def: 
$vgpr31 -; GCN-NEXT: s_xor_b64 exec, exec, s[48:49] -; GCN-NEXT: s_cbranch_execnz .LBB4_1 +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[48:49] +; GCN-NEXT: s_cselect_b64 exec, s[4:5], s[46:47] +; GCN-NEXT: s_cbranch_scc1 .LBB4_1 ; GCN-NEXT: ; %bb.2: -; GCN-NEXT: s_mov_b64 exec, s[46:47] ; GCN-NEXT: v_add_i32_e32 v0, vcc, 1, v2 ; GCN-NEXT: v_readlane_b32 s49, v40, 17 ; GCN-NEXT: v_readlane_b32 s48, v40, 16 @@ -560,7 +550,7 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) { ; GISEL-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[18:19] -; GISEL-NEXT: v_writelane_b32 v40, s16, 18 +; GISEL-NEXT: v_writelane_b32 v40, s16, 16 ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 @@ -578,8 +568,6 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) { ; GISEL-NEXT: v_writelane_b32 v40, s45, 13 ; GISEL-NEXT: v_writelane_b32 v40, s46, 14 ; GISEL-NEXT: v_writelane_b32 v40, s47, 15 -; GISEL-NEXT: v_writelane_b32 v40, s48, 16 -; GISEL-NEXT: v_writelane_b32 v40, s49, 17 ; GISEL-NEXT: s_mov_b32 s42, s15 ; GISEL-NEXT: s_mov_b32 s43, s14 ; GISEL-NEXT: s_mov_b32 s44, s13 @@ -588,12 +576,11 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) { ; GISEL-NEXT: s_mov_b64 s[36:37], s[8:9] ; GISEL-NEXT: s_mov_b64 s[38:39], s[6:7] ; GISEL-NEXT: s_mov_b64 s[40:41], s[4:5] -; GISEL-NEXT: s_mov_b64 s[46:47], exec ; GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_readfirstlane_b32 s16, v0 ; GISEL-NEXT: v_readfirstlane_b32 s17, v1 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] -; GISEL-NEXT: s_and_saveexec_b64 s[48:49], vcc +; GISEL-NEXT: s_and_saveexec_b64 s[46:47], vcc ; GISEL-NEXT: s_mov_b64 s[4:5], s[40:41] ; GISEL-NEXT: s_mov_b64 s[6:7], s[38:39] ; GISEL-NEXT: s_mov_b64 s[8:9], s[36:37] @@ -606,13 +593,11 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) { ; GISEL-NEXT: 
v_mov_b32_e32 v1, v0 ; GISEL-NEXT: ; implicit-def: $vgpr0 ; GISEL-NEXT: ; implicit-def: $vgpr31 -; GISEL-NEXT: s_xor_b64 exec, exec, s[48:49] -; GISEL-NEXT: s_cbranch_execnz .LBB4_1 +; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[46:47] +; GISEL-NEXT: s_cselect_b64 exec, s[4:5], s[46:47] +; GISEL-NEXT: s_cbranch_scc1 .LBB4_1 ; GISEL-NEXT: ; %bb.2: -; GISEL-NEXT: s_mov_b64 exec, s[46:47] ; GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v1 -; GISEL-NEXT: v_readlane_b32 s49, v40, 17 -; GISEL-NEXT: v_readlane_b32 s48, v40, 16 ; GISEL-NEXT: v_readlane_b32 s47, v40, 15 ; GISEL-NEXT: v_readlane_b32 s46, v40, 14 ; GISEL-NEXT: v_readlane_b32 s45, v40, 13 @@ -629,7 +614,7 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) { ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 -; GISEL-NEXT: v_readlane_b32 s4, v40, 18 +; GISEL-NEXT: v_readlane_b32 s4, v40, 16 ; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[6:7] @@ -683,8 +668,10 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) { ; GCN-NEXT: s_mov_b64 s[40:41], s[4:5] ; GCN-NEXT: v_and_b32_e32 v2, 1, v2 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GCN-NEXT: s_and_saveexec_b64 s[46:47], vcc -; GCN-NEXT: s_cbranch_execz .LBB5_4 +; GCN-NEXT: s_and_b64 s[4:5], vcc, -1 +; GCN-NEXT: s_mov_b64 s[46:47], exec +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB5_4 ; GCN-NEXT: ; %bb.1: ; %bb1 ; GCN-NEXT: s_mov_b64 s[48:49], exec ; GCN-NEXT: .LBB5_2: ; =>This Inner Loop Header: Depth=1 @@ -703,12 +690,12 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) { ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_xor_b64 exec, exec, s[50:51] -; GCN-NEXT: s_cbranch_execnz .LBB5_2 +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[50:51] +; GCN-NEXT: 
s_cselect_b64 exec, s[4:5], s[48:49] +; GCN-NEXT: s_cbranch_scc1 .LBB5_2 ; GCN-NEXT: ; %bb.3: -; GCN-NEXT: s_mov_b64 exec, s[48:49] -; GCN-NEXT: .LBB5_4: ; %bb2 ; GCN-NEXT: s_or_b64 exec, exec, s[46:47] +; GCN-NEXT: .LBB5_4: ; %bb2 ; GCN-NEXT: v_readlane_b32 s51, v40, 19 ; GCN-NEXT: v_readlane_b32 s50, v40, 18 ; GCN-NEXT: v_readlane_b32 s49, v40, 17 @@ -746,7 +733,7 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) { ; GISEL-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[18:19] -; GISEL-NEXT: v_writelane_b32 v40, s16, 20 +; GISEL-NEXT: v_writelane_b32 v40, s16, 18 ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 @@ -766,8 +753,6 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) { ; GISEL-NEXT: v_writelane_b32 v40, s47, 15 ; GISEL-NEXT: v_writelane_b32 v40, s48, 16 ; GISEL-NEXT: v_writelane_b32 v40, s49, 17 -; GISEL-NEXT: v_writelane_b32 v40, s50, 18 -; GISEL-NEXT: v_writelane_b32 v40, s51, 19 ; GISEL-NEXT: s_mov_b32 s42, s15 ; GISEL-NEXT: s_mov_b32 s43, s14 ; GISEL-NEXT: s_mov_b32 s44, s13 @@ -778,15 +763,15 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) { ; GISEL-NEXT: s_mov_b64 s[40:41], s[4:5] ; GISEL-NEXT: v_and_b32_e32 v2, 1, v2 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: s_and_saveexec_b64 s[46:47], vcc -; GISEL-NEXT: s_cbranch_execz .LBB5_4 -; GISEL-NEXT: ; %bb.1: ; %bb1 -; GISEL-NEXT: s_mov_b64 s[48:49], exec -; GISEL-NEXT: .LBB5_2: ; =>This Inner Loop Header: Depth=1 +; GISEL-NEXT: s_and_b64 s[4:5], vcc, -1 +; GISEL-NEXT: s_mov_b64 s[46:47], exec +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB5_3 +; GISEL-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_readfirstlane_b32 s16, v0 ; GISEL-NEXT: v_readfirstlane_b32 s17, v1 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 
s[16:17], v[0:1] -; GISEL-NEXT: s_and_saveexec_b64 s[50:51], vcc +; GISEL-NEXT: s_and_saveexec_b64 s[48:49], vcc ; GISEL-NEXT: s_mov_b64 s[4:5], s[40:41] ; GISEL-NEXT: s_mov_b64 s[6:7], s[38:39] ; GISEL-NEXT: s_mov_b64 s[8:9], s[36:37] @@ -798,14 +783,12 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) { ; GISEL-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GISEL-NEXT: ; implicit-def: $vgpr0 ; GISEL-NEXT: ; implicit-def: $vgpr31 -; GISEL-NEXT: s_xor_b64 exec, exec, s[50:51] -; GISEL-NEXT: s_cbranch_execnz .LBB5_2 -; GISEL-NEXT: ; %bb.3: -; GISEL-NEXT: s_mov_b64 exec, s[48:49] -; GISEL-NEXT: .LBB5_4: ; %bb2 +; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[48:49] +; GISEL-NEXT: s_cselect_b64 exec, s[4:5], s[48:49] +; GISEL-NEXT: s_cbranch_scc1 .LBB5_1 +; GISEL-NEXT: ; %bb.2: ; GISEL-NEXT: s_or_b64 exec, exec, s[46:47] -; GISEL-NEXT: v_readlane_b32 s51, v40, 19 -; GISEL-NEXT: v_readlane_b32 s50, v40, 18 +; GISEL-NEXT: .LBB5_3: ; %bb2 ; GISEL-NEXT: v_readlane_b32 s49, v40, 17 ; GISEL-NEXT: v_readlane_b32 s48, v40, 16 ; GISEL-NEXT: v_readlane_b32 s47, v40, 15 @@ -824,7 +807,7 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) { ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 -; GISEL-NEXT: v_readlane_b32 s4, v40, 20 +; GISEL-NEXT: v_readlane_b32 s4, v40, 18 ; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[6:7] @@ -847,11 +830,11 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GCN-LABEL: test_indirect_call_vgpr_ptr_inreg_arg: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s5, s33 +; GCN-NEXT: s_mov_b32 s12, s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill 
-; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 @@ -894,10 +877,10 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GCN-NEXT: s_movk_i32 s4, 0x7b ; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: s_xor_b64 exec, exec, s[10:11] -; GCN-NEXT: s_cbranch_execnz .LBB6_1 +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[10:11] +; GCN-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GCN-NEXT: s_cbranch_scc1 .LBB6_1 ; GCN-NEXT: ; %bb.2: -; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: v_readlane_b32 s63, v40, 31 ; GCN-NEXT: v_readlane_b32 s62, v40, 30 ; GCN-NEXT: v_readlane_b32 s61, v40, 29 @@ -930,22 +913,22 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s5 +; GCN-NEXT: s_mov_b32 s33, s12 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_indirect_call_vgpr_ptr_inreg_arg: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s5, s33 +; GISEL-NEXT: s_mov_b32 s10, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 -; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GISEL-NEXT: s_mov_b64 exec, s[6:7] +; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 @@ -979,19 +962,18 @@ define void 
@test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GISEL-NEXT: v_writelane_b32 v40, s61, 29 ; GISEL-NEXT: v_writelane_b32 v40, s62, 30 ; GISEL-NEXT: v_writelane_b32 v40, s63, 31 -; GISEL-NEXT: s_mov_b64 s[6:7], exec ; GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 -; GISEL-NEXT: v_readfirstlane_b32 s8, v0 -; GISEL-NEXT: v_readfirstlane_b32 s9, v1 -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GISEL-NEXT: s_and_saveexec_b64 s[10:11], vcc +; GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GISEL-NEXT: v_readfirstlane_b32 s7, v1 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[0:1] +; GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GISEL-NEXT: s_movk_i32 s4, 0x7b -; GISEL-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr0 -; GISEL-NEXT: s_xor_b64 exec, exec, s[10:11] -; GISEL-NEXT: s_cbranch_execnz .LBB6_1 +; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[8:9] +; GISEL-NEXT: s_cselect_b64 exec, s[4:5], s[8:9] +; GISEL-NEXT: s_cbranch_scc1 .LBB6_1 ; GISEL-NEXT: ; %bb.2: -; GISEL-NEXT: s_mov_b64 exec, s[6:7] ; GISEL-NEXT: v_readlane_b32 s63, v40, 31 ; GISEL-NEXT: v_readlane_b32 s62, v40, 30 ; GISEL-NEXT: v_readlane_b32 s61, v40, 29 @@ -1024,11 +1006,11 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 -; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GISEL-NEXT: s_mov_b64 exec, s[6:7] +; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: s_mov_b32 s33, s5 +; GISEL-NEXT: s_mov_b32 s33, s10 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void %fptr(i32 inreg 123) @@ -1088,10 +1070,10 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) { ; GCN-NEXT: 
v_mov_b32_e32 v0, v40 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GCN-NEXT: ; implicit-def: $vgpr1_vgpr2 -; GCN-NEXT: s_xor_b64 exec, exec, s[8:9] -; GCN-NEXT: s_cbranch_execnz .LBB7_1 +; GCN-NEXT: s_xor_b64 s[6:7], exec, s[8:9] +; GCN-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN-NEXT: s_cbranch_scc1 .LBB7_1 ; GCN-NEXT: ; %bb.2: -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, v40 ; GCN-NEXT: v_readlane_b32 s63, v41, 31 ; GCN-NEXT: v_readlane_b32 s62, v41, 30 @@ -1137,7 +1119,7 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) { ; GISEL-LABEL: test_indirect_call_vgpr_ptr_arg_and_reuse: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s10, s33 +; GISEL-NEXT: s_mov_b32 s8, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill @@ -1177,19 +1159,18 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) { ; GISEL-NEXT: v_writelane_b32 v41, s62, 30 ; GISEL-NEXT: v_writelane_b32 v41, s63, 31 ; GISEL-NEXT: v_mov_b32_e32 v40, v0 -; GISEL-NEXT: s_mov_b64 s[4:5], exec ; GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GISEL-NEXT: v_readfirstlane_b32 s6, v1 -; GISEL-NEXT: v_readfirstlane_b32 s7, v2 -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[1:2] -; GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GISEL-NEXT: v_readfirstlane_b32 s4, v1 +; GISEL-NEXT: v_readfirstlane_b32 s5, v2 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[1:2] +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v40 -; GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GISEL-NEXT: ; implicit-def: $vgpr1 -; GISEL-NEXT: s_xor_b64 exec, exec, s[8:9] -; GISEL-NEXT: s_cbranch_execnz .LBB7_1 +; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; GISEL-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GISEL-NEXT: s_cbranch_scc1 
.LBB7_1 ; GISEL-NEXT: ; %bb.2: -; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: v_mov_b32_e32 v0, v40 ; GISEL-NEXT: v_readlane_b32 s63, v41, 31 ; GISEL-NEXT: v_readlane_b32 s62, v41, 30 @@ -1228,7 +1209,7 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) { ; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: s_mov_b32 s33, s10 +; GISEL-NEXT: s_mov_b32 s33, s8 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void %fptr(i32 %i) @@ -1291,10 +1272,10 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) { ; GCN-NEXT: v_mov_b32_e32 v3, v0 ; GCN-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: s_xor_b64 exec, exec, s[6:7] -; GCN-NEXT: s_cbranch_execnz .LBB8_1 +; GCN-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GCN-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN-NEXT: s_cbranch_scc1 .LBB8_1 ; GCN-NEXT: ; %bb.2: -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, v3 ; GCN-NEXT: v_readlane_b32 s63, v40, 31 ; GCN-NEXT: v_readlane_b32 s62, v40, 30 @@ -1339,7 +1320,7 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) { ; GISEL-LABEL: test_indirect_call_vgpr_ptr_arg_and_return: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s10, s33 +; GISEL-NEXT: s_mov_b32 s8, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1377,20 +1358,19 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) { ; GISEL-NEXT: v_writelane_b32 v40, s61, 29 ; GISEL-NEXT: v_writelane_b32 v40, s62, 30 ; GISEL-NEXT: v_writelane_b32 v40, s63, 31 -; GISEL-NEXT: s_mov_b64 s[4:5], exec ; GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 -; GISEL-NEXT: 
v_readfirstlane_b32 s8, v1 -; GISEL-NEXT: v_readfirstlane_b32 s9, v2 -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2] -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GISEL-NEXT: v_readfirstlane_b32 s6, v1 +; GISEL-NEXT: v_readfirstlane_b32 s7, v2 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[1:2] +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GISEL-NEXT: v_mov_b32_e32 v2, v0 ; GISEL-NEXT: ; implicit-def: $vgpr1 ; GISEL-NEXT: ; implicit-def: $vgpr0 -; GISEL-NEXT: s_xor_b64 exec, exec, s[6:7] -; GISEL-NEXT: s_cbranch_execnz .LBB8_1 +; GISEL-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; GISEL-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GISEL-NEXT: s_cbranch_scc1 .LBB8_1 ; GISEL-NEXT: ; %bb.2: -; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: v_mov_b32_e32 v0, v2 ; GISEL-NEXT: v_readlane_b32 s63, v40, 31 ; GISEL-NEXT: v_readlane_b32 s62, v40, 30 @@ -1428,7 +1408,7 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) { ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: s_mov_b32 s33, s10 +; GISEL-NEXT: s_mov_b32 s33, s8 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] %ret = call amdgpu_gfx i32 %fptr(i32 %i) @@ -1486,10 +1466,10 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) { ; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: s_xor_b64 exec, exec, s[8:9] -; GCN-NEXT: s_cbranch_execnz .LBB9_1 +; GCN-NEXT: s_xor_b64 s[6:7], exec, s[8:9] +; GCN-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN-NEXT: s_cbranch_scc1 .LBB9_1 ; GCN-NEXT: ; %bb.2: -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_readlane_b32 s63, v40, 31 ; GCN-NEXT: v_readlane_b32 s62, v40, 30 ; GCN-NEXT: v_readlane_b32 s61, v40, 29 @@ -1533,7 +1513,7 @@ define void 
@test_indirect_tail_call_vgpr_ptr(ptr %fptr) { ; GISEL-LABEL: test_indirect_tail_call_vgpr_ptr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s10, s33 +; GISEL-NEXT: s_mov_b32 s8, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1571,18 +1551,17 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) { ; GISEL-NEXT: v_writelane_b32 v40, s61, 29 ; GISEL-NEXT: v_writelane_b32 v40, s62, 30 ; GISEL-NEXT: v_writelane_b32 v40, s63, 31 -; GISEL-NEXT: s_mov_b64 s[4:5], exec ; GISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 -; GISEL-NEXT: v_readfirstlane_b32 s6, v0 -; GISEL-NEXT: v_readfirstlane_b32 s7, v1 -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[0:1] -; GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GISEL-NEXT: v_readfirstlane_b32 s5, v1 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GISEL-NEXT: ; implicit-def: $vgpr0 -; GISEL-NEXT: s_xor_b64 exec, exec, s[8:9] -; GISEL-NEXT: s_cbranch_execnz .LBB9_1 +; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; GISEL-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GISEL-NEXT: s_cbranch_scc1 .LBB9_1 ; GISEL-NEXT: ; %bb.2: -; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: v_readlane_b32 s63, v40, 31 ; GISEL-NEXT: v_readlane_b32 s62, v40, 30 ; GISEL-NEXT: v_readlane_b32 s61, v40, 29 @@ -1619,7 +1598,7 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) { ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: s_mov_b32 s33, s10 +; GISEL-NEXT: s_mov_b32 s33, s8 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] tail call amdgpu_gfx void %fptr() diff 
--git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll index f54a511eff7f1d..3b5e460a4aef86 100644 --- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll @@ -27,6 +27,7 @@ define amdgpu_kernel void @infinite_loop(ptr addrspace(1) %out) { ; IR-NEXT: br i1 true, label [[LOOP]], label [[DUMMYRETURNBLOCK:%.*]] ; IR: DummyReturnBlock: ; IR-NEXT: ret void +; entry: br label %loop @@ -66,6 +67,7 @@ define amdgpu_kernel void @infinite_loop_ret(ptr addrspace(1) %out) { ; IR-NEXT: br i1 true, label [[LOOP]], label [[UNIFIEDRETURNBLOCK]] ; IR: UnifiedReturnBlock: ; IR-NEXT: ret void +; entry: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %cond = icmp eq i32 %tmp, 1 @@ -129,6 +131,7 @@ define amdgpu_kernel void @infinite_loops(ptr addrspace(1) %out) { ; IR-NEXT: br i1 true, label [[LOOP2]], label [[DUMMYRETURNBLOCK]] ; IR: DummyReturnBlock: ; IR-NEXT: ret void +; entry: br i1 undef, label %loop1, label %loop2 @@ -189,6 +192,7 @@ define amdgpu_kernel void @infinite_loop_nest_ret(ptr addrspace(1) %out) { ; IR-NEXT: br i1 [[COND3]], label [[INNER_LOOP]], label [[OUTER_LOOP]] ; IR: UnifiedReturnBlock: ; IR-NEXT: ret void +; entry: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %cond1 = icmp ne i32 %tmp, 1 ; avoid following BB optimizing away through the domination diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll index cddfb21a6fbdf4..64de02364f659c 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll @@ -57,16 +57,18 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: s_mov_b32 s12, s13 ; GFX11-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX11-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_mov_b32 s20, exec_lo ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GFX11-NEXT: s_mov_b32 s6, 0 ; GFX11-NEXT: 
s_mov_b32 s0, -1 -; GFX11-NEXT: s_mov_b32 s20, exec_lo ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mul_lo_u32 v0, s21, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: s_cbranch_execz .LBB2_13 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX11-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB2_13 ; GFX11-NEXT: ; %bb.1: ; %bb14 ; GFX11-NEXT: s_load_b128 s[16:19], s[2:3], 0x2c ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -171,10 +173,13 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: .LBB2_12: ; %Flow11 ; GFX11-NEXT: s_and_b32 s6, s1, exec_lo ; GFX11-NEXT: s_or_not1_b32 s0, s17, exec_lo -; GFX11-NEXT: .LBB2_13: ; %Flow9 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s20 -; GFX11-NEXT: s_and_saveexec_b32 s7, s0 -; GFX11-NEXT: s_cbranch_execz .LBB2_15 +; GFX11-NEXT: .LBB2_13: ; %Flow9 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s7, exec_lo +; GFX11-NEXT: s_and_b32 s0, s0, exec_lo +; GFX11-NEXT: s_cmov_b32 exec_lo, s0 +; GFX11-NEXT: s_cbranch_scc0 .LBB2_15 ; GFX11-NEXT: ; %bb.14: ; %bb43 ; GFX11-NEXT: s_add_u32 s8, s2, 0x58 ; GFX11-NEXT: s_addc_u32 s9, s3, 0 @@ -187,12 +192,15 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_or_b32 s6, s6, exec_lo -; GFX11-NEXT: .LBB2_15: ; %Flow14 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s7 -; GFX11-NEXT: s_and_saveexec_b32 s0, s6 +; GFX11-NEXT: .LBB2_15: ; %Flow14 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s0, s6, exec_lo +; GFX11-NEXT: s_cmov_b32 exec_lo, s0 +; GFX11-NEXT: s_cbranch_scc0 .LBB2_17 ; GFX11-NEXT: ; %bb.16: ; %UnifiedUnreachableBlock ; GFX11-NEXT: ; divergent 
unreachable -; GFX11-NEXT: ; %bb.17: ; %UnifiedReturnBlock +; GFX11-NEXT: .LBB2_17: ; %UnifiedReturnBlock ; GFX11-NEXT: s_endpgm bb: %i = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll index 3b972352e0e450..32215a142610ca 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll @@ -23,11 +23,11 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB0_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: syncscope_workgroup_nortn: @@ -43,11 +43,11 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: syncscope_workgroup_nortn: @@ -66,10 +66,10 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 
exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB0_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-FLATSCR-LABEL: syncscope_workgroup_nortn: @@ -85,11 +85,11 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-FLATSCR-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-FLATSCR-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-FLATSCR-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-FLATSCR-NEXT: s_cbranch_execnz .LBB0_1 +; GFX9-FLATSCR-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-FLATSCR-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX9-FLATSCR-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-FLATSCR-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: syncscope_workgroup_nortn: @@ -100,7 +100,6 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc @@ -109,11 +108,11 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB0_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; 
GFX11-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: syncscope_workgroup_nortn: @@ -153,10 +152,10 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB1_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -178,10 +177,10 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -203,10 +202,10 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB1_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -226,10 +225,10 @@ define i32 
@atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GFX9-FLATSCR-NEXT: buffer_wbinvl1_vol ; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX9-FLATSCR-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-FLATSCR-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-FLATSCR-NEXT: s_cbranch_execnz .LBB1_1 +; GFX9-FLATSCR-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-FLATSCR-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-FLATSCR-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX9-FLATSCR-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-FLATSCR-NEXT: s_setpc_b64 s[30:31] ; @@ -252,11 +251,11 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB1_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -283,11 +282,11 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB1_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 
exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw nand ptr addrspace(1) %ptr, i32 4 seq_cst @@ -683,8 +682,9 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -703,8 +703,9 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB5_2 +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -722,8 +723,9 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX10-NEXT: s_mov_b32 s0, exec_lo ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB5_2 +; GFX10-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX10-NEXT: ; %bb.1: ; GFX10-NEXT: s_load_dword s1, s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -743,8 +745,9 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[4:5], vcc 
-; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-FLATSCR-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-FLATSCR-NEXT: s_cmov_b64 exec, vcc +; GFX9-FLATSCR-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX9-FLATSCR-NEXT: ; %bb.1: ; GFX9-FLATSCR-NEXT: s_load_dword s2, s[2:3], 0x24 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) @@ -760,11 +763,12 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX11-LABEL: atomic_add_local: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_mov_b32 s1, exec_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX11-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -781,11 +785,12 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX12-LABEL: atomic_add_local: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12-NEXT: s_cbranch_execz .LBB5_2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX12-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX12-NEXT: ; %bb.1: ; GFX12-NEXT: s_load_b32 s1, s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -882,9 +887,11 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; 
GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -894,8 +901,8 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB7_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: .LBB7_2: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 @@ -911,9 +918,11 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_mov_b64 s[0:1], exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX90A-NEXT: ; implicit-def: $vgpr1 -; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB7_2 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -923,8 +932,8 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB7_2: ; GFX90A-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX90A-NEXT: .LBB7_2: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_readfirstlane_b32 s2, v1 @@ -937,11 +946,13 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX10-LABEL: atomic_add_ret_local: ; GFX10: ; %bb.0: ; GFX10-NEXT: 
s_mov_b32 s1, exec_lo -; GFX10-NEXT: ; implicit-def: $vgpr1 +; GFX10-NEXT: s_mov_b32 s0, exec_lo ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX10-NEXT: ; implicit-def: $vgpr1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB7_2 +; GFX10-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX10-NEXT: ; %bb.1: ; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -952,9 +963,9 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX10-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: .LBB7_2: ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-NEXT: .LBB7_2: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_readfirstlane_b32 s2, v1 @@ -970,9 +981,11 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-FLATSCR-NEXT: s_mov_b64 s[0:1], exec +; GFX9-FLATSCR-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-FLATSCR-NEXT: ; implicit-def: $vgpr1 -; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-FLATSCR-NEXT: s_cmov_b64 exec, vcc +; GFX9-FLATSCR-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX9-FLATSCR-NEXT: ; %bb.1: ; GFX9-FLATSCR-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) @@ -982,8 +995,8 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-FLATSCR-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-FLATSCR-NEXT: .LBB7_2: ; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1] +; 
GFX9-FLATSCR-NEXT: .LBB7_2: ; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s2, v1 @@ -1000,8 +1013,10 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX11-NEXT: ; implicit-def: $vgpr1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX11-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1012,8 +1027,8 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: .LBB7_2: ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: .LBB7_2: ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s2, v1 @@ -1031,8 +1046,10 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX12-NEXT: ; implicit-def: $vgpr1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12-NEXT: s_cbranch_execz .LBB7_2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX12-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX12-NEXT: ; %bb.1: ; GFX12-NEXT: s_load_b32 s4, s[2:3], 0x2c ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1044,8 +1061,8 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: .LBB7_2: ; GFX12-NEXT: s_or_b32 
exec_lo, exec_lo, s0 +; GFX12-NEXT: .LBB7_2: ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v1 @@ -1072,9 +1089,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1083,8 +1102,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: .LBB8_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: .LBB8_2: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 @@ -1100,9 +1119,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_mov_b64 s[0:1], exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX90A-NEXT: ; implicit-def: $vgpr1 -; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB8_2 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -1111,8 +1132,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX90A-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NEXT: buffer_atomic_add v1, off, 
s[8:11], 0 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: .LBB8_2: ; GFX90A-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX90A-NEXT: .LBB8_2: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_readfirstlane_b32 s2, v1 @@ -1125,11 +1146,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10-LABEL: add_i32_constant: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_mov_b32 s1, exec_lo -; GFX10-NEXT: ; implicit-def: $vgpr1 +; GFX10-NEXT: s_mov_b32 s0, exec_lo ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX10-NEXT: ; implicit-def: $vgpr1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB8_2 +; GFX10-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX10-NEXT: ; %bb.1: ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1138,9 +1161,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: .LBB8_2: ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-NEXT: .LBB8_2: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_readfirstlane_b32 s2, v1 @@ -1156,9 +1179,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-FLATSCR-NEXT: s_mov_b64 s[0:1], exec +; GFX9-FLATSCR-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-FLATSCR-NEXT: ; implicit-def: $vgpr1 -; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-FLATSCR-NEXT: s_cmov_b64 exec, vcc +; 
GFX9-FLATSCR-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX9-FLATSCR-NEXT: ; %bb.1: ; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) @@ -1167,8 +1192,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-FLATSCR-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: .LBB8_2: ; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-FLATSCR-NEXT: .LBB8_2: ; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s2, v1 @@ -1185,8 +1210,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX11-NEXT: ; implicit-def: $vgpr1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX11-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1196,8 +1223,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: .LBB8_2: ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: .LBB8_2: ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s2, v1 @@ -1215,8 +1242,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX12-NEXT: ; implicit-def: $vgpr1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12-NEXT: s_cbranch_execz 
.LBB8_2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX12-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX12-NEXT: ; %bb.1: ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1226,8 +1255,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: .LBB8_2: ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: .LBB8_2: ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v1 diff --git a/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll b/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll index 0adce2b84aa0d3..ab7fc91e58b6a0 100644 --- a/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll +++ b/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll @@ -40,15 +40,15 @@ define void @issue92561(ptr addrspace(1) %arg) { ; SDAG-NEXT: s_and_b32 s0, s0, s1 ; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; SDAG-NEXT: s_and_b32 s0, s0, s2 -; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; SDAG-NEXT: s_and_saveexec_b32 s0, s0 ; SDAG-NEXT: image_sample_c_lz v9, [v8, v8, v8, v8], s[4:11], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY +; SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; SDAG-NEXT: ; implicit-def: $vgpr8 -; SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; SDAG-NEXT: s_cbranch_execnz .LBB0_1 +; SDAG-NEXT: s_cselect_b32 exec_lo, s0, s3 +; SDAG-NEXT: s_cbranch_scc1 .LBB0_1 ; SDAG-NEXT: ; %bb.2: -; SDAG-NEXT: s_mov_b32 exec_lo, s3 ; SDAG-NEXT: v_dual_mov_b32 v0, 
0x7fc00000 :: v_dual_mov_b32 v1, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 1.0 ; SDAG-NEXT: s_mov_b32 s0, s12 @@ -84,7 +84,7 @@ define void @issue92561(ptr addrspace(1) %arg) { ; GISEL-NEXT: global_load_b128 v[6:9], v[0:1], off offset:16 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_mov_b32 s20, 0 -; GISEL-NEXT: s_mov_b32 s3, exec_lo +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GISEL-NEXT: s_mov_b32 s21, s20 ; GISEL-NEXT: s_mov_b32 s22, s20 ; GISEL-NEXT: s_mov_b32 s23, s20 @@ -116,15 +116,15 @@ define void @issue92561(ptr addrspace(1) %arg) { ; GISEL-NEXT: s_and_b32 s0, s0, s1 ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GISEL-NEXT: s_and_b32 s0, s0, s2 -; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GISEL-NEXT: s_and_saveexec_b32 s0, s0 ; GISEL-NEXT: image_sample_c_lz v1, [v0, v0, v0, v0], s[12:19], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY +; GISEL-NEXT: s_xor_b32 s1, exec_lo, s0 ; GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 ; GISEL-NEXT: ; implicit-def: $vgpr0 -; GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GISEL-NEXT: s_cbranch_execnz .LBB0_1 +; GISEL-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GISEL-NEXT: s_cbranch_scc1 .LBB0_1 ; GISEL-NEXT: ; %bb.2: -; GISEL-NEXT: s_mov_b32 exec_lo, s3 ; GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 1.0 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x7fc00000 ; GISEL-NEXT: s_clause 0x2 diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll index 3e6de324924579..a0ebf9b89d8265 100644 --- a/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll +++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll @@ -13,9 +13,11 @@ define bfloat @sitofp_i128_to_bf16(i128 %x) { ; GCN-NEXT: v_or_b32_e32 v5, v1, v3 ; GCN-NEXT: v_or_b32_e32 v4, v0, v2 ; GCN-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN-NEXT: s_mov_b64 s[6:7], exec ; GCN-NEXT: v_mov_b32_e32 v4, 0 -; GCN-NEXT: 
s_and_saveexec_b64 s[6:7], vcc -; GCN-NEXT: s_cbranch_execz .LBB0_14 +; GCN-NEXT: s_cmp_lg_u64 vcc, 0 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB0_14 ; GCN-NEXT: ; %bb.1: ; %itofp-if-end ; GCN-NEXT: v_sub_co_u32_e32 v4, vcc, 0, v0 ; GCN-NEXT: v_subb_co_u32_e32 v5, vcc, 0, v1, vcc @@ -39,10 +41,12 @@ define bfloat @sitofp_i128_to_bf16(i128 %x) { ; GCN-NEXT: v_add_u32_e32 v6, 64, v6 ; GCN-NEXT: v_cndmask_b32_e32 v7, v6, v2, vcc ; GCN-NEXT: v_sub_u32_e32 v6, 0x80, v7 -; GCN-NEXT: v_sub_u32_e32 v2, 0x7f, v7 ; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 25, v6 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_xor_b64 s[4:5], vcc, exec +; GCN-NEXT: v_sub_u32_e32 v2, 0x7f, v7 +; GCN-NEXT: s_cmp_lg_u64 vcc, 0 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB0_3 ; GCN-NEXT: ; %bb.2: ; %itofp-if-else ; GCN-NEXT: v_add_u32_e32 v4, 0xffffff98, v7 ; GCN-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] @@ -52,18 +56,24 @@ define bfloat @sitofp_i128_to_bf16(i128 %x) { ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN-NEXT: ; implicit-def: $vgpr7 ; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GCN-NEXT: ; %bb.3: ; %Flow3 -; GCN-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_13 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: .LBB0_3: ; %Flow3 +; GCN-NEXT: s_xor_b64 s[8:9], s[4:5], exec +; GCN-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-NEXT: s_cbranch_scc0 .LBB0_13 ; GCN-NEXT: ; %bb.4: ; %NodeBlock ; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 25, v6 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[10:11], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_8 +; GCN-NEXT: s_xor_b64 s[10:11], vcc, exec +; GCN-NEXT: s_cmp_lg_u64 vcc, 0 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB0_8 ; GCN-NEXT: ; %bb.5: ; %LeafBlock ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 26, v6 -; GCN-NEXT: s_and_saveexec_b64 s[12:13], vcc -; GCN-NEXT: s_cbranch_execz 
.LBB0_7 +; GCN-NEXT: s_mov_b64 s[12:13], exec +; GCN-NEXT: s_cmp_lg_u64 vcc, 0 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB0_7 ; GCN-NEXT: ; %bb.6: ; %itofp-sw-default ; GCN-NEXT: v_sub_u32_e32 v12, 0x66, v7 ; GCN-NEXT: v_sub_u32_e32 v10, 64, v12 @@ -102,29 +112,36 @@ define bfloat @sitofp_i128_to_bf16(i128 %x) { ; GCN-NEXT: v_or_b32_e32 v8, v15, v0 ; GCN-NEXT: v_mov_b32_e32 v0, v8 ; GCN-NEXT: v_mov_b32_e32 v1, v9 -; GCN-NEXT: .LBB0_7: ; %Flow1 ; GCN-NEXT: s_or_b64 exec, exec, s[12:13] +; GCN-NEXT: .LBB0_7: ; %Flow1 +; GCN-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN-NEXT: .LBB0_8: ; %Flow2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; GCN-NEXT: s_xor_b64 s[4:5], s[10:11], exec +; GCN-NEXT: s_cmp_lg_u64 s[10:11], 0 +; GCN-NEXT: s_cmov_b64 exec, s[10:11] +; GCN-NEXT: s_cbranch_scc0 .LBB0_10 ; GCN-NEXT: ; %bb.9: ; %itofp-sw-bb ; GCN-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GCN-NEXT: ; %bb.10: ; %itofp-sw-epilog ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: .LBB0_10: ; %itofp-sw-epilog ; GCN-NEXT: v_lshrrev_b32_e32 v4, 2, v0 ; GCN-NEXT: v_and_or_b32 v0, v4, 1, v0 ; GCN-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 ; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: v_and_b32_e32 v4, 0x4000000, v0 ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: s_mov_b64 s[4:5], exec ; GCN-NEXT: v_alignbit_b32 v8, v1, v0, 2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cmp_lg_u64 vcc, 0 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB0_12 ; GCN-NEXT: ; %bb.11: ; %itofp-if-then20 ; GCN-NEXT: v_alignbit_b32 v8, v1, v0, 3 ; GCN-NEXT: v_mov_b32_e32 v2, v6 -; GCN-NEXT: ; %bb.12: ; %Flow ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: .LBB0_13: ; %Flow4 +; GCN-NEXT: .LBB0_12: ; %Flow ; GCN-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-NEXT: .LBB0_13: ; %itofp-if-end26 ; GCN-NEXT: v_and_b32_e32 v0, 0x80000000, v3 ; GCN-NEXT: v_lshl_add_u32 v1, v2, 23, 1.0 ; GCN-NEXT: v_and_b32_e32 v2, 0x7fffff, v8 @@ -136,8 +153,8 @@ 
define bfloat @sitofp_i128_to_bf16(i128 %x) { ; GCN-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GCN-NEXT: .LBB0_14: ; %Flow5 ; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: .LBB0_14: ; %itofp-return ; GCN-NEXT: v_mov_b32_e32 v0, v4 ; GCN-NEXT: s_setpc_b64 s[30:31] %cvt = sitofp i128 %x to bfloat @@ -151,9 +168,11 @@ define bfloat @uitofp_i128_to_bf16(i128 %x) { ; GCN-NEXT: v_or_b32_e32 v5, v1, v3 ; GCN-NEXT: v_or_b32_e32 v4, v0, v2 ; GCN-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN-NEXT: s_mov_b64 s[6:7], exec ; GCN-NEXT: v_mov_b32_e32 v4, 0 -; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GCN-NEXT: s_cbranch_execz .LBB1_14 +; GCN-NEXT: s_cmp_lg_u64 vcc, 0 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB1_14 ; GCN-NEXT: ; %bb.1: ; %itofp-if-end ; GCN-NEXT: v_ffbh_u32_e32 v4, v2 ; GCN-NEXT: v_add_u32_e32 v4, 32, v4 @@ -167,11 +186,13 @@ define bfloat @uitofp_i128_to_bf16(i128 %x) { ; GCN-NEXT: v_add_u32_e32 v5, 64, v5 ; GCN-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc ; GCN-NEXT: v_sub_u32_e32 v5, 0x80, v6 -; GCN-NEXT: v_sub_u32_e32 v4, 0x7f, v6 ; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 25, v5 +; GCN-NEXT: s_xor_b64 s[4:5], vcc, exec +; GCN-NEXT: v_sub_u32_e32 v4, 0x7f, v6 +; GCN-NEXT: s_cmp_lg_u64 vcc, 0 ; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB1_3 ; GCN-NEXT: ; %bb.2: ; %itofp-if-else ; GCN-NEXT: v_add_u32_e32 v2, 0xffffff98, v6 ; GCN-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] @@ -181,18 +202,24 @@ define bfloat @uitofp_i128_to_bf16(i128 %x) { ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN-NEXT: ; implicit-def: $vgpr6 ; GCN-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN-NEXT: ; %bb.3: ; %Flow3 -; GCN-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_13 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: .LBB1_3: 
; %Flow3 +; GCN-NEXT: s_xor_b64 s[8:9], s[4:5], exec +; GCN-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-NEXT: s_cbranch_scc0 .LBB1_13 ; GCN-NEXT: ; %bb.4: ; %NodeBlock ; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 25, v5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[10:11], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_8 +; GCN-NEXT: s_xor_b64 s[10:11], vcc, exec +; GCN-NEXT: s_cmp_lg_u64 vcc, 0 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB1_8 ; GCN-NEXT: ; %bb.5: ; %LeafBlock ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 26, v5 -; GCN-NEXT: s_and_saveexec_b64 s[12:13], vcc -; GCN-NEXT: s_cbranch_execz .LBB1_7 +; GCN-NEXT: s_mov_b64 s[12:13], exec +; GCN-NEXT: s_cmp_lg_u64 vcc, 0 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB1_7 ; GCN-NEXT: ; %bb.6: ; %itofp-sw-default ; GCN-NEXT: v_sub_u32_e32 v11, 0x66, v6 ; GCN-NEXT: v_sub_u32_e32 v9, 64, v11 @@ -231,29 +258,36 @@ define bfloat @uitofp_i128_to_bf16(i128 %x) { ; GCN-NEXT: v_or_b32_e32 v7, v14, v0 ; GCN-NEXT: v_mov_b32_e32 v0, v7 ; GCN-NEXT: v_mov_b32_e32 v1, v8 -; GCN-NEXT: .LBB1_7: ; %Flow1 ; GCN-NEXT: s_or_b64 exec, exec, s[12:13] +; GCN-NEXT: .LBB1_7: ; %Flow1 +; GCN-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN-NEXT: .LBB1_8: ; %Flow2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; GCN-NEXT: s_xor_b64 s[4:5], s[10:11], exec +; GCN-NEXT: s_cmp_lg_u64 s[10:11], 0 +; GCN-NEXT: s_cmov_b64 exec, s[10:11] +; GCN-NEXT: s_cbranch_scc0 .LBB1_10 ; GCN-NEXT: ; %bb.9: ; %itofp-sw-bb ; GCN-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GCN-NEXT: ; %bb.10: ; %itofp-sw-epilog ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: .LBB1_10: ; %itofp-sw-epilog ; GCN-NEXT: v_lshrrev_b32_e32 v2, 2, v0 ; GCN-NEXT: v_and_or_b32 v0, v2, 1, v0 ; GCN-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 ; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: v_and_b32_e32 v2, 0x4000000, v0 ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: s_mov_b64 s[4:5], exec ; GCN-NEXT: 
v_alignbit_b32 v7, v1, v0, 2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cmp_lg_u64 vcc, 0 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB1_12 ; GCN-NEXT: ; %bb.11: ; %itofp-if-then20 ; GCN-NEXT: v_alignbit_b32 v7, v1, v0, 3 ; GCN-NEXT: v_mov_b32_e32 v4, v5 -; GCN-NEXT: ; %bb.12: ; %Flow ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: .LBB1_13: ; %Flow4 +; GCN-NEXT: .LBB1_12: ; %Flow ; GCN-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-NEXT: .LBB1_13: ; %itofp-if-end26 ; GCN-NEXT: v_and_b32_e32 v0, 0x7fffff, v7 ; GCN-NEXT: v_lshl_or_b32 v0, v4, 23, v0 ; GCN-NEXT: v_add_u32_e32 v0, 1.0, v0 @@ -264,8 +298,8 @@ define bfloat @uitofp_i128_to_bf16(i128 %x) { ; GCN-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GCN-NEXT: .LBB1_14: ; %Flow5 ; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: .LBB1_14: ; %itofp-return ; GCN-NEXT: v_mov_b32_e32 v0, v4 ; GCN-NEXT: s_setpc_b64 s[30:31] %cvt = uitofp i128 %x to bfloat diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll index c5198cdb421a50..813809976838b9 100644 --- a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll @@ -9,9 +9,11 @@ define float @sitofp_i128_to_f32(i128 %x) { ; SDAG-NEXT: v_or_b32_e32 v5, v1, v3 ; SDAG-NEXT: v_or_b32_e32 v4, v0, v2 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; SDAG-NEXT: s_mov_b64 s[6:7], exec ; SDAG-NEXT: v_mov_b32_e32 v4, 0 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc -; SDAG-NEXT: s_cbranch_execz .LBB0_14 +; SDAG-NEXT: s_cmp_lg_u64 vcc, 0 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB0_14 ; SDAG-NEXT: ; %bb.1: ; %itofp-if-end ; SDAG-NEXT: v_sub_co_u32_e32 v4, vcc, 0, v0 ; SDAG-NEXT: v_subb_co_u32_e32 v5, vcc, 0, v1, vcc @@ -35,10 +37,12 @@ define float @sitofp_i128_to_f32(i128 %x) { ; SDAG-NEXT: v_add_u32_e32 v6, 64, v6 ; SDAG-NEXT: v_cndmask_b32_e32 v7, v6, v2, vcc ; SDAG-NEXT: 
v_sub_u32_e32 v6, 0x80, v7 -; SDAG-NEXT: v_sub_u32_e32 v2, 0x7f, v7 ; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v6 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SDAG-NEXT: s_xor_b64 s[4:5], vcc, exec +; SDAG-NEXT: v_sub_u32_e32 v2, 0x7f, v7 +; SDAG-NEXT: s_cmp_lg_u64 vcc, 0 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB0_3 ; SDAG-NEXT: ; %bb.2: ; %itofp-if-else ; SDAG-NEXT: v_add_u32_e32 v4, 0xffffff98, v7 ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] @@ -48,18 +52,24 @@ define float @sitofp_i128_to_f32(i128 %x) { ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr7 ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 -; SDAG-NEXT: ; %bb.3: ; %Flow3 -; SDAG-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB0_13 +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB0_3: ; %Flow3 +; SDAG-NEXT: s_xor_b64 s[8:9], s[4:5], exec +; SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 +; SDAG-NEXT: s_cmov_b64 exec, s[4:5] +; SDAG-NEXT: s_cbranch_scc0 .LBB0_13 ; SDAG-NEXT: ; %bb.4: ; %NodeBlock ; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 25, v6 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB0_8 +; SDAG-NEXT: s_xor_b64 s[10:11], vcc, exec +; SDAG-NEXT: s_cmp_lg_u64 vcc, 0 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB0_8 ; SDAG-NEXT: ; %bb.5: ; %LeafBlock ; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 26, v6 -; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc -; SDAG-NEXT: s_cbranch_execz .LBB0_7 +; SDAG-NEXT: s_mov_b64 s[12:13], exec +; SDAG-NEXT: s_cmp_lg_u64 vcc, 0 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB0_7 ; SDAG-NEXT: ; %bb.6: ; %itofp-sw-default ; SDAG-NEXT: v_sub_u32_e32 v12, 0x66, v7 ; SDAG-NEXT: v_sub_u32_e32 v10, 64, v12 @@ -98,35 +108,42 @@ define float @sitofp_i128_to_f32(i128 %x) { ; SDAG-NEXT: v_or_b32_e32 v8, v15, v0 ; SDAG-NEXT: v_mov_b32_e32 v0, v8 ; SDAG-NEXT: 
v_mov_b32_e32 v1, v9 -; SDAG-NEXT: .LBB0_7: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] +; SDAG-NEXT: .LBB0_7: ; %Flow1 +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB0_8: ; %Flow2 -; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: s_xor_b64 s[4:5], s[10:11], exec +; SDAG-NEXT: s_cmp_lg_u64 s[10:11], 0 +; SDAG-NEXT: s_cmov_b64 exec, s[10:11] +; SDAG-NEXT: s_cbranch_scc0 .LBB0_10 ; SDAG-NEXT: ; %bb.9: ; %itofp-sw-bb ; SDAG-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; SDAG-NEXT: ; %bb.10: ; %itofp-sw-epilog ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB0_10: ; %itofp-sw-epilog ; SDAG-NEXT: v_lshrrev_b32_e32 v4, 2, v0 ; SDAG-NEXT: v_and_or_b32 v0, v4, 1, v0 ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 ; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; SDAG-NEXT: v_and_b32_e32 v4, 0x4000000, v0 ; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SDAG-NEXT: s_mov_b64 s[4:5], exec ; SDAG-NEXT: v_alignbit_b32 v8, v1, v0, 2 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_cmp_lg_u64 vcc, 0 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB0_12 ; SDAG-NEXT: ; %bb.11: ; %itofp-if-then20 ; SDAG-NEXT: v_alignbit_b32 v8, v1, v0, 3 ; SDAG-NEXT: v_mov_b32_e32 v2, v6 -; SDAG-NEXT: ; %bb.12: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; SDAG-NEXT: .LBB0_13: ; %Flow4 +; SDAG-NEXT: .LBB0_12: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: .LBB0_13: ; %itofp-if-end26 ; SDAG-NEXT: v_and_b32_e32 v0, 0x80000000, v3 ; SDAG-NEXT: v_lshl_add_u32 v1, v2, 23, 1.0 ; SDAG-NEXT: v_and_b32_e32 v2, 0x7fffff, v8 ; SDAG-NEXT: v_or3_b32 v4, v2, v0, v1 -; SDAG-NEXT: .LBB0_14: ; %Flow5 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: .LBB0_14: ; %itofp-return ; SDAG-NEXT: v_mov_b32_e32 v0, v4 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -137,9 +154,11 @@ define float @sitofp_i128_to_f32(i128 %x) { ; GISEL-NEXT: v_or_b32_e32 v5, v1, v3 ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GISEL-NEXT: s_mov_b32 s4, 0 +; 
GISEL-NEXT: s_mov_b64 s[6:7], exec ; GISEL-NEXT: v_mov_b32_e32 v4, s4 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_cbranch_execz .LBB0_14 +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB0_14 ; GISEL-NEXT: ; %bb.1: ; %itofp-if-end ; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v3 ; GISEL-NEXT: v_xor_b32_e32 v0, v6, v0 @@ -162,11 +181,13 @@ define float @sitofp_i128_to_f32(i128 %x) { ; GISEL-NEXT: v_min_u32_e32 v5, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc ; GISEL-NEXT: v_sub_u32_e32 v8, 0x80, v5 -; GISEL-NEXT: v_sub_u32_e32 v7, 0x7f, v5 ; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 24, v8 +; GISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; GISEL-NEXT: v_sub_u32_e32 v7, 0x7f, v5 +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 ; GISEL-NEXT: ; implicit-def: $vgpr4 -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB0_3 ; GISEL-NEXT: ; %bb.2: ; %itofp-if-else ; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff98, v5 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] @@ -176,18 +197,24 @@ define float @sitofp_i128_to_f32(i128 %x) { ; GISEL-NEXT: ; implicit-def: $vgpr0 ; GISEL-NEXT: ; implicit-def: $vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr2 -; GISEL-NEXT: ; %bb.3: ; %Flow3 -; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB0_13 +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB0_3: ; %Flow3 +; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], exec +; GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB0_13 ; GISEL-NEXT: ; %bb.4: ; %NodeBlock ; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 26, v8 -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GISEL-NEXT: s_xor_b64 s[10:11], exec, s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB0_8 +; GISEL-NEXT: s_xor_b64 s[10:11], vcc, exec +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; 
GISEL-NEXT: s_cbranch_scc0 .LBB0_8 ; GISEL-NEXT: ; %bb.5: ; %LeafBlock ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 26, v8 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc -; GISEL-NEXT: s_cbranch_execz .LBB0_7 +; GISEL-NEXT: s_mov_b64 s[12:13], exec +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB0_7 ; GISEL-NEXT: ; %bb.6: ; %itofp-sw-default ; GISEL-NEXT: v_sub_u32_e32 v4, 0x66, v5 ; GISEL-NEXT: v_sub_u32_e32 v11, 64, v4 @@ -230,36 +257,43 @@ define float @sitofp_i128_to_f32(i128 %x) { ; GISEL-NEXT: v_mov_b32_e32 v1, v4 ; GISEL-NEXT: v_mov_b32_e32 v2, v5 ; GISEL-NEXT: v_mov_b32_e32 v3, v6 -; GISEL-NEXT: .LBB0_7: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB0_7: ; %Flow1 +; GISEL-NEXT: s_or_b64 exec, exec, s[10:11] ; GISEL-NEXT: .LBB0_8: ; %Flow2 -; GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; GISEL-NEXT: s_xor_b64 s[4:5], s[10:11], exec +; GISEL-NEXT: s_cmp_lg_u64 s[10:11], 0 +; GISEL-NEXT: s_cmov_b64 exec, s[10:11] +; GISEL-NEXT: s_cbranch_scc0 .LBB0_10 ; GISEL-NEXT: ; %bb.9: ; %itofp-sw-bb ; GISEL-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GISEL-NEXT: ; %bb.10: ; %itofp-sw-epilog ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB0_10: ; %itofp-sw-epilog ; GISEL-NEXT: v_bfe_u32 v2, v0, 2, 1 ; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 ; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 ; GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GISEL-NEXT: v_and_b32_e32 v2, 0x4000000, v0 ; GISEL-NEXT: v_mov_b32_e32 v3, 0 -; GISEL-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1] ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1] +; GISEL-NEXT: s_mov_b64 s[4:5], exec +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB0_12 ; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20 ; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1] ; GISEL-NEXT: v_mov_b32_e32 v7, v8 -; GISEL-NEXT: ; 
%bb.12: ; %Flow ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GISEL-NEXT: .LBB0_13: ; %Flow4 +; GISEL-NEXT: .LBB0_12: ; %Flow ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: .LBB0_13: ; %itofp-if-end26 ; GISEL-NEXT: v_and_b32_e32 v0, 0x80000000, v6 ; GISEL-NEXT: v_lshl_add_u32 v1, v7, 23, 1.0 ; GISEL-NEXT: v_and_b32_e32 v2, 0x7fffff, v4 ; GISEL-NEXT: v_or3_b32 v4, v2, v0, v1 -; GISEL-NEXT: .LBB0_14: ; %Flow5 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: .LBB0_14: ; %itofp-return ; GISEL-NEXT: v_mov_b32_e32 v0, v4 ; GISEL-NEXT: s_setpc_b64 s[30:31] %cvt = sitofp i128 %x to float @@ -273,9 +307,11 @@ define float @uitofp_i128_to_f32(i128 %x) { ; SDAG-NEXT: v_or_b32_e32 v5, v1, v3 ; SDAG-NEXT: v_or_b32_e32 v4, v0, v2 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; SDAG-NEXT: s_mov_b64 s[6:7], exec ; SDAG-NEXT: v_mov_b32_e32 v4, 0 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc -; SDAG-NEXT: s_cbranch_execz .LBB1_14 +; SDAG-NEXT: s_cmp_lg_u64 vcc, 0 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB1_14 ; SDAG-NEXT: ; %bb.1: ; %itofp-if-end ; SDAG-NEXT: v_ffbh_u32_e32 v4, v2 ; SDAG-NEXT: v_add_u32_e32 v4, 32, v4 @@ -289,11 +325,13 @@ define float @uitofp_i128_to_f32(i128 %x) { ; SDAG-NEXT: v_add_u32_e32 v5, 64, v5 ; SDAG-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc ; SDAG-NEXT: v_sub_u32_e32 v5, 0x80, v6 -; SDAG-NEXT: v_sub_u32_e32 v4, 0x7f, v6 ; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v5 +; SDAG-NEXT: s_xor_b64 s[4:5], vcc, exec +; SDAG-NEXT: v_sub_u32_e32 v4, 0x7f, v6 +; SDAG-NEXT: s_cmp_lg_u64 vcc, 0 ; SDAG-NEXT: ; implicit-def: $vgpr7 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB1_3 ; SDAG-NEXT: ; %bb.2: ; %itofp-if-else ; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff98, v6 ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] @@ -303,18 +341,24 @@ define float @uitofp_i128_to_f32(i128 %x) { ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; 
SDAG-NEXT: ; implicit-def: $vgpr6 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: ; %bb.3: ; %Flow3 -; SDAG-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB1_13 +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB1_3: ; %Flow3 +; SDAG-NEXT: s_xor_b64 s[8:9], s[4:5], exec +; SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 +; SDAG-NEXT: s_cmov_b64 exec, s[4:5] +; SDAG-NEXT: s_cbranch_scc0 .LBB1_13 ; SDAG-NEXT: ; %bb.4: ; %NodeBlock ; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 25, v5 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB1_8 +; SDAG-NEXT: s_xor_b64 s[10:11], vcc, exec +; SDAG-NEXT: s_cmp_lg_u64 vcc, 0 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB1_8 ; SDAG-NEXT: ; %bb.5: ; %LeafBlock ; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 26, v5 -; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc -; SDAG-NEXT: s_cbranch_execz .LBB1_7 +; SDAG-NEXT: s_mov_b64 s[12:13], exec +; SDAG-NEXT: s_cmp_lg_u64 vcc, 0 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB1_7 ; SDAG-NEXT: ; %bb.6: ; %itofp-sw-default ; SDAG-NEXT: v_sub_u32_e32 v11, 0x66, v6 ; SDAG-NEXT: v_sub_u32_e32 v9, 64, v11 @@ -353,34 +397,41 @@ define float @uitofp_i128_to_f32(i128 %x) { ; SDAG-NEXT: v_or_b32_e32 v7, v14, v0 ; SDAG-NEXT: v_mov_b32_e32 v0, v7 ; SDAG-NEXT: v_mov_b32_e32 v1, v8 -; SDAG-NEXT: .LBB1_7: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] +; SDAG-NEXT: .LBB1_7: ; %Flow1 +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB1_8: ; %Flow2 -; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: s_xor_b64 s[4:5], s[10:11], exec +; SDAG-NEXT: s_cmp_lg_u64 s[10:11], 0 +; SDAG-NEXT: s_cmov_b64 exec, s[10:11] +; SDAG-NEXT: s_cbranch_scc0 .LBB1_10 ; SDAG-NEXT: ; %bb.9: ; %itofp-sw-bb ; SDAG-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; SDAG-NEXT: ; %bb.10: ; %itofp-sw-epilog ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB1_10: ; %itofp-sw-epilog ; 
SDAG-NEXT: v_lshrrev_b32_e32 v2, 2, v0 ; SDAG-NEXT: v_and_or_b32 v0, v2, 1, v0 ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 ; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; SDAG-NEXT: v_and_b32_e32 v2, 0x4000000, v0 ; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SDAG-NEXT: s_mov_b64 s[4:5], exec ; SDAG-NEXT: v_alignbit_b32 v7, v1, v0, 2 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_cmp_lg_u64 vcc, 0 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB1_12 ; SDAG-NEXT: ; %bb.11: ; %itofp-if-then20 ; SDAG-NEXT: v_alignbit_b32 v7, v1, v0, 3 ; SDAG-NEXT: v_mov_b32_e32 v4, v5 -; SDAG-NEXT: ; %bb.12: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; SDAG-NEXT: .LBB1_13: ; %Flow4 +; SDAG-NEXT: .LBB1_12: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: .LBB1_13: ; %itofp-if-end26 ; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v7 ; SDAG-NEXT: v_lshl_or_b32 v0, v4, 23, v0 ; SDAG-NEXT: v_add_u32_e32 v4, 1.0, v0 -; SDAG-NEXT: .LBB1_14: ; %Flow5 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: .LBB1_14: ; %itofp-return ; SDAG-NEXT: v_mov_b32_e32 v0, v4 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -391,9 +442,11 @@ define float @uitofp_i128_to_f32(i128 %x) { ; GISEL-NEXT: v_or_b32_e32 v5, v1, v3 ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], exec ; GISEL-NEXT: v_mov_b32_e32 v4, s4 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_cbranch_execz .LBB1_14 +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB1_14 ; GISEL-NEXT: ; %bb.1: ; %itofp-if-end ; GISEL-NEXT: v_ffbh_u32_e32 v5, v0 ; GISEL-NEXT: v_ffbh_u32_e32 v4, v1 @@ -407,11 +460,13 @@ define float @uitofp_i128_to_f32(i128 %x) { ; GISEL-NEXT: v_min_u32_e32 v5, v5, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc ; GISEL-NEXT: v_sub_u32_e32 v7, 0x80, v5 -; GISEL-NEXT: v_sub_u32_e32 v6, 0x7f, v5 ; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 24, v7 +; GISEL-NEXT: 
s_xor_b64 s[4:5], vcc, exec +; GISEL-NEXT: v_sub_u32_e32 v6, 0x7f, v5 +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 ; GISEL-NEXT: ; implicit-def: $vgpr4 -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB1_3 ; GISEL-NEXT: ; %bb.2: ; %itofp-if-else ; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff98, v5 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] @@ -421,18 +476,24 @@ define float @uitofp_i128_to_f32(i128 %x) { ; GISEL-NEXT: ; implicit-def: $vgpr0 ; GISEL-NEXT: ; implicit-def: $vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr2 -; GISEL-NEXT: ; %bb.3: ; %Flow3 -; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB1_13 +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB1_3: ; %Flow3 +; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], exec +; GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB1_13 ; GISEL-NEXT: ; %bb.4: ; %NodeBlock ; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 26, v7 -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GISEL-NEXT: s_xor_b64 s[10:11], exec, s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB1_8 +; GISEL-NEXT: s_xor_b64 s[10:11], vcc, exec +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB1_8 ; GISEL-NEXT: ; %bb.5: ; %LeafBlock ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 26, v7 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc -; GISEL-NEXT: s_cbranch_execz .LBB1_7 +; GISEL-NEXT: s_mov_b64 s[12:13], exec +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB1_7 ; GISEL-NEXT: ; %bb.6: ; %itofp-sw-default ; GISEL-NEXT: v_sub_u32_e32 v4, 0x66, v5 ; GISEL-NEXT: v_sub_u32_e32 v10, 64, v4 @@ -475,35 +536,42 @@ define float @uitofp_i128_to_f32(i128 %x) { ; GISEL-NEXT: v_mov_b32_e32 v1, v4 ; GISEL-NEXT: v_mov_b32_e32 v2, v5 ; GISEL-NEXT: v_mov_b32_e32 v3, v6 -; GISEL-NEXT: .LBB1_7: ; %Flow1 ; 
GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB1_7: ; %Flow1 +; GISEL-NEXT: s_or_b64 exec, exec, s[10:11] ; GISEL-NEXT: .LBB1_8: ; %Flow2 -; GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; GISEL-NEXT: s_xor_b64 s[4:5], s[10:11], exec +; GISEL-NEXT: s_cmp_lg_u64 s[10:11], 0 +; GISEL-NEXT: s_cmov_b64 exec, s[10:11] +; GISEL-NEXT: s_cbranch_scc0 .LBB1_10 ; GISEL-NEXT: ; %bb.9: ; %itofp-sw-bb ; GISEL-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GISEL-NEXT: ; %bb.10: ; %itofp-sw-epilog ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB1_10: ; %itofp-sw-epilog ; GISEL-NEXT: v_bfe_u32 v2, v0, 2, 1 ; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 ; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 ; GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GISEL-NEXT: v_and_b32_e32 v2, 0x4000000, v0 ; GISEL-NEXT: v_mov_b32_e32 v3, 0 -; GISEL-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1] ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1] +; GISEL-NEXT: s_mov_b64 s[4:5], exec +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB1_12 ; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20 ; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1] ; GISEL-NEXT: v_mov_b32_e32 v6, v7 -; GISEL-NEXT: ; %bb.12: ; %Flow ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GISEL-NEXT: .LBB1_13: ; %Flow4 +; GISEL-NEXT: .LBB1_12: ; %Flow ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: .LBB1_13: ; %itofp-if-end26 ; GISEL-NEXT: v_lshl_add_u32 v0, v6, 23, 1.0 ; GISEL-NEXT: v_mov_b32_e32 v1, 0x7fffff ; GISEL-NEXT: v_and_or_b32 v4, v4, v1, v0 -; GISEL-NEXT: .LBB1_14: ; %Flow5 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: .LBB1_14: ; %itofp-return ; GISEL-NEXT: v_mov_b32_e32 v0, v4 ; GISEL-NEXT: s_setpc_b64 s[30:31] %cvt = uitofp i128 %x to float @@ -520,9 +588,11 @@ define double @sitofp_i128_to_f64(i128 %x) { ; SDAG-NEXT: v_or_b32_e32 v0, v4, v2 ; SDAG-NEXT: 
v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], exec ; SDAG-NEXT: v_mov_b32_e32 v1, 0 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc -; SDAG-NEXT: s_cbranch_execz .LBB2_14 +; SDAG-NEXT: s_cmp_lg_u64 vcc, 0 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB2_14 ; SDAG-NEXT: ; %bb.1: ; %itofp-if-end ; SDAG-NEXT: v_sub_co_u32_e32 v0, vcc, 0, v4 ; SDAG-NEXT: v_subb_co_u32_e32 v1, vcc, 0, v5, vcc @@ -546,11 +616,13 @@ define double @sitofp_i128_to_f64(i128 %x) { ; SDAG-NEXT: v_add_u32_e32 v1, 64, v1 ; SDAG-NEXT: v_cndmask_b32_e32 v9, v1, v0, vcc ; SDAG-NEXT: v_sub_u32_e32 v8, 0x80, v9 -; SDAG-NEXT: v_sub_u32_e32 v2, 0x7f, v9 ; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 54, v8 +; SDAG-NEXT: s_xor_b64 s[4:5], vcc, exec +; SDAG-NEXT: v_sub_u32_e32 v2, 0x7f, v9 +; SDAG-NEXT: s_cmp_lg_u64 vcc, 0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB2_3 ; SDAG-NEXT: ; %bb.2: ; %itofp-if-else ; SDAG-NEXT: v_add_u32_e32 v6, 0xffffffb5, v9 ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5] @@ -561,18 +633,24 @@ define double @sitofp_i128_to_f64(i128 %x) { ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; SDAG-NEXT: ; implicit-def: $vgpr9 -; SDAG-NEXT: ; %bb.3: ; %Flow3 -; SDAG-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB2_13 +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB2_3: ; %Flow3 +; SDAG-NEXT: s_xor_b64 s[8:9], s[4:5], exec +; SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 +; SDAG-NEXT: s_cmov_b64 exec, s[4:5] +; SDAG-NEXT: s_cbranch_scc0 .LBB2_13 ; SDAG-NEXT: ; %bb.4: ; %NodeBlock ; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 54, v8 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB2_8 +; SDAG-NEXT: s_xor_b64 s[10:11], vcc, exec +; SDAG-NEXT: 
s_cmp_lg_u64 vcc, 0 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB2_8 ; SDAG-NEXT: ; %bb.5: ; %LeafBlock ; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 55, v8 -; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc -; SDAG-NEXT: s_cbranch_execz .LBB2_7 +; SDAG-NEXT: s_mov_b64 s[12:13], exec +; SDAG-NEXT: s_cmp_lg_u64 vcc, 0 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB2_7 ; SDAG-NEXT: ; %bb.6: ; %itofp-sw-default ; SDAG-NEXT: v_sub_u32_e32 v12, 0x49, v9 ; SDAG-NEXT: v_sub_u32_e32 v10, 64, v12 @@ -616,44 +694,51 @@ define double @sitofp_i128_to_f64(i128 %x) { ; SDAG-NEXT: v_mov_b32_e32 v5, v1 ; SDAG-NEXT: v_mov_b32_e32 v4, v0 ; SDAG-NEXT: v_mov_b32_e32 v7, v11 -; SDAG-NEXT: .LBB2_7: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] +; SDAG-NEXT: .LBB2_7: ; %Flow1 +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB2_8: ; %Flow2 -; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: s_xor_b64 s[4:5], s[10:11], exec +; SDAG-NEXT: s_cmp_lg_u64 s[10:11], 0 +; SDAG-NEXT: s_cmov_b64 exec, s[10:11] +; SDAG-NEXT: s_cbranch_scc0 .LBB2_10 ; SDAG-NEXT: ; %bb.9: ; %itofp-sw-bb ; SDAG-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] ; SDAG-NEXT: v_lshrrev_b32_e32 v0, 31, v5 ; SDAG-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5] ; SDAG-NEXT: v_or_b32_e32 v6, v6, v0 -; SDAG-NEXT: ; %bb.10: ; %itofp-sw-epilog ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB2_10: ; %itofp-sw-epilog ; SDAG-NEXT: v_lshrrev_b32_e32 v0, 2, v4 ; SDAG-NEXT: v_and_or_b32 v0, v0, 1, v4 ; SDAG-NEXT: v_add_co_u32_e32 v4, vcc, 1, v0 ; SDAG-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; SDAG-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; SDAG-NEXT: v_lshrrev_b64 v[0:1], 2, v[4:5] +; SDAG-NEXT: v_and_b32_e32 v9, 0x800000, v5 ; SDAG-NEXT: v_lshlrev_b32_e32 v7, 30, v6 +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SDAG-NEXT: s_mov_b64 s[4:5], exec ; SDAG-NEXT: v_or_b32_e32 v10, v1, v7 -; SDAG-NEXT: v_and_b32_e32 v1, 0x800000, v5 -; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; 
SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_cmp_lg_u64 vcc, 0 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB2_12 ; SDAG-NEXT: ; %bb.11: ; %itofp-if-then20 ; SDAG-NEXT: v_lshrrev_b64 v[0:1], 3, v[4:5] ; SDAG-NEXT: v_lshlrev_b32_e32 v2, 29, v6 ; SDAG-NEXT: v_or_b32_e32 v10, v1, v2 ; SDAG-NEXT: v_mov_b32_e32 v2, v8 -; SDAG-NEXT: ; %bb.12: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; SDAG-NEXT: .LBB2_13: ; %Flow4 +; SDAG-NEXT: .LBB2_12: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: .LBB2_13: ; %itofp-if-end26 ; SDAG-NEXT: v_and_b32_e32 v1, 0x80000000, v3 ; SDAG-NEXT: v_mov_b32_e32 v3, 0x3ff00000 ; SDAG-NEXT: v_lshl_add_u32 v2, v2, 20, v3 ; SDAG-NEXT: v_and_b32_e32 v3, 0xfffff, v10 ; SDAG-NEXT: v_or3_b32 v1, v3, v1, v2 -; SDAG-NEXT: .LBB2_14: ; %Flow5 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: .LBB2_14: ; %itofp-return ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: sitofp_i128_to_f64: @@ -661,14 +746,16 @@ define double @sitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v4, v0 ; GISEL-NEXT: v_mov_b32_e32 v5, v1 -; GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GISEL-NEXT: v_or_b32_e32 v0, v4, v2 ; GISEL-NEXT: v_or_b32_e32 v1, v5, v3 ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: s_mov_b64 s[6:7], exec ; GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_cbranch_execz .LBB2_14 +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB2_14 ; GISEL-NEXT: ; %bb.1: ; %itofp-if-end ; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v3 ; GISEL-NEXT: v_xor_b32_e32 v0, v6, v4 @@ -691,12 +778,14 @@ define double @sitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: v_min_u32_e32 v5, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e32 v9, v5, v4, vcc ; GISEL-NEXT: v_sub_u32_e32 v8, 0x80, v9 -; GISEL-NEXT: 
v_sub_u32_e32 v7, 0x7f, v9 ; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 53, v8 +; GISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; GISEL-NEXT: v_sub_u32_e32 v7, 0x7f, v9 +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 ; GISEL-NEXT: ; implicit-def: $vgpr10 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB2_3 ; GISEL-NEXT: ; %bb.2: ; %itofp-if-else ; GISEL-NEXT: v_add_u32_e32 v2, 0xffffffb5, v9 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] @@ -706,18 +795,24 @@ define double @sitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: ; implicit-def: $vgpr8 ; GISEL-NEXT: ; implicit-def: $vgpr0 ; GISEL-NEXT: ; implicit-def: $vgpr9 -; GISEL-NEXT: ; %bb.3: ; %Flow3 -; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB2_13 +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB2_3: ; %Flow3 +; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], exec +; GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB2_13 ; GISEL-NEXT: ; %bb.4: ; %NodeBlock ; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 55, v8 -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GISEL-NEXT: s_xor_b64 s[10:11], exec, s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB2_8 +; GISEL-NEXT: s_xor_b64 s[10:11], vcc, exec +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB2_8 ; GISEL-NEXT: ; %bb.5: ; %LeafBlock ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 55, v8 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc -; GISEL-NEXT: s_cbranch_execz .LBB2_7 +; GISEL-NEXT: s_mov_b64 s[12:13], exec +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB2_7 ; GISEL-NEXT: ; %bb.6: ; %itofp-sw-default ; GISEL-NEXT: v_sub_u32_e32 v14, 0x49, v9 ; GISEL-NEXT: v_sub_u32_e32 v10, 64, v14 @@ -762,10 +857,14 @@ define double @sitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: 
v_mov_b32_e32 v1, v4 ; GISEL-NEXT: v_mov_b32_e32 v2, v5 ; GISEL-NEXT: v_mov_b32_e32 v3, v6 -; GISEL-NEXT: .LBB2_7: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB2_7: ; %Flow1 +; GISEL-NEXT: s_or_b64 exec, exec, s[10:11] ; GISEL-NEXT: .LBB2_8: ; %Flow2 -; GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; GISEL-NEXT: s_xor_b64 s[4:5], s[10:11], exec +; GISEL-NEXT: s_cmp_lg_u64 s[10:11], 0 +; GISEL-NEXT: s_cmov_b64 exec, s[10:11] +; GISEL-NEXT: s_cbranch_scc0 .LBB2_10 ; GISEL-NEXT: ; %bb.9: ; %itofp-sw-bb ; GISEL-NEXT: v_lshlrev_b64 v[9:10], 1, v[0:1] ; GISEL-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] @@ -775,27 +874,30 @@ define double @sitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: v_mov_b32_e32 v1, v10 ; GISEL-NEXT: v_mov_b32_e32 v2, v11 ; GISEL-NEXT: v_mov_b32_e32 v3, v12 -; GISEL-NEXT: ; %bb.10: ; %itofp-sw-epilog ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB2_10: ; %itofp-sw-epilog ; GISEL-NEXT: v_bfe_u32 v3, v0, 2, 1 ; GISEL-NEXT: v_or_b32_e32 v0, v0, v3 ; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 ; GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GISEL-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc -; GISEL-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1] ; GISEL-NEXT: v_mov_b32_e32 v9, 0 ; GISEL-NEXT: v_and_b32_e32 v10, 0x800000, v1 -; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[9:10] +; GISEL-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1] +; GISEL-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[9:10] +; GISEL-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc +; GISEL-NEXT: s_mov_b64 s[10:11], exec ; GISEL-NEXT: v_lshl_or_b32 v10, v2, 30, v5 -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB2_12 ; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20 ; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1] ; GISEL-NEXT: v_mov_b32_e32 v7, v8 ; GISEL-NEXT: v_lshl_or_b32 v10, v2, 29, v5 -; GISEL-NEXT: ; %bb.12: ; %Flow -; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GISEL-NEXT: 
.LBB2_13: ; %Flow4 +; GISEL-NEXT: s_or_b64 exec, exec, s[10:11] +; GISEL-NEXT: .LBB2_12: ; %Flow ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: .LBB2_13: ; %itofp-if-end26 ; GISEL-NEXT: v_and_b32_e32 v0, 0x80000000, v6 ; GISEL-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; GISEL-NEXT: v_mov_b32_e32 v2, 0xfffff @@ -803,8 +905,8 @@ define double @sitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: v_and_or_b32 v2, v10, v2, v0 ; GISEL-NEXT: v_and_or_b32 v0, v4, -1, 0 ; GISEL-NEXT: v_or3_b32 v1, v2, v1, 0 -; GISEL-NEXT: .LBB2_14: ; %Flow5 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: .LBB2_14: ; %itofp-return ; GISEL-NEXT: s_setpc_b64 s[30:31] %cvt = sitofp i128 %x to double ret double %cvt @@ -818,9 +920,11 @@ define double @uitofp_i128_to_f64(i128 %x) { ; SDAG-NEXT: v_or_b32_e32 v4, v0, v2 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; SDAG-NEXT: v_mov_b32_e32 v4, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], exec ; SDAG-NEXT: v_mov_b32_e32 v5, 0 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc -; SDAG-NEXT: s_cbranch_execz .LBB3_14 +; SDAG-NEXT: s_cmp_lg_u64 vcc, 0 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB3_14 ; SDAG-NEXT: ; %bb.1: ; %itofp-if-end ; SDAG-NEXT: v_ffbh_u32_e32 v4, v2 ; SDAG-NEXT: v_add_u32_e32 v4, 32, v4 @@ -834,12 +938,14 @@ define double @uitofp_i128_to_f64(i128 %x) { ; SDAG-NEXT: v_add_u32_e32 v5, 64, v5 ; SDAG-NEXT: v_cndmask_b32_e32 v8, v5, v4, vcc ; SDAG-NEXT: v_sub_u32_e32 v7, 0x80, v8 -; SDAG-NEXT: v_sub_u32_e32 v6, 0x7f, v8 ; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 54, v7 +; SDAG-NEXT: s_xor_b64 s[4:5], vcc, exec +; SDAG-NEXT: v_sub_u32_e32 v6, 0x7f, v8 +; SDAG-NEXT: s_cmp_lg_u64 vcc, 0 ; SDAG-NEXT: ; implicit-def: $vgpr9 ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB3_3 ; SDAG-NEXT: ; %bb.2: ; %itofp-if-else ; SDAG-NEXT: v_add_u32_e32 v2, 0xffffffb5, v8 ; SDAG-NEXT: v_lshlrev_b64 
v[0:1], v2, v[0:1] @@ -850,18 +956,24 @@ define double @uitofp_i128_to_f64(i128 %x) { ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr8 -; SDAG-NEXT: ; %bb.3: ; %Flow3 -; SDAG-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB3_13 +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB3_3: ; %Flow3 +; SDAG-NEXT: s_xor_b64 s[8:9], s[4:5], exec +; SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 +; SDAG-NEXT: s_cmov_b64 exec, s[4:5] +; SDAG-NEXT: s_cbranch_scc0 .LBB3_13 ; SDAG-NEXT: ; %bb.4: ; %NodeBlock ; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 54, v7 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB3_8 +; SDAG-NEXT: s_xor_b64 s[10:11], vcc, exec +; SDAG-NEXT: s_cmp_lg_u64 vcc, 0 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB3_8 ; SDAG-NEXT: ; %bb.5: ; %LeafBlock ; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 55, v7 -; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc -; SDAG-NEXT: s_cbranch_execz .LBB3_7 +; SDAG-NEXT: s_mov_b64 s[12:13], exec +; SDAG-NEXT: s_cmp_lg_u64 vcc, 0 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB3_7 ; SDAG-NEXT: ; %bb.6: ; %itofp-sw-default ; SDAG-NEXT: v_sub_u32_e32 v11, 0x49, v8 ; SDAG-NEXT: v_sub_u32_e32 v9, 64, v11 @@ -905,40 +1017,47 @@ define double @uitofp_i128_to_f64(i128 %x) { ; SDAG-NEXT: v_mov_b32_e32 v0, v4 ; SDAG-NEXT: v_mov_b32_e32 v1, v5 ; SDAG-NEXT: v_mov_b32_e32 v3, v10 -; SDAG-NEXT: .LBB3_7: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] +; SDAG-NEXT: .LBB3_7: ; %Flow1 +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB3_8: ; %Flow2 -; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: s_xor_b64 s[4:5], s[10:11], exec +; SDAG-NEXT: s_cmp_lg_u64 s[10:11], 0 +; SDAG-NEXT: s_cmov_b64 exec, s[10:11] +; SDAG-NEXT: s_cbranch_scc0 .LBB3_10 ; SDAG-NEXT: ; %bb.9: ; %itofp-sw-bb ; SDAG-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; 
SDAG-NEXT: v_lshrrev_b32_e32 v3, 31, v1 ; SDAG-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; SDAG-NEXT: v_or_b32_e32 v2, v2, v3 -; SDAG-NEXT: ; %bb.10: ; %itofp-sw-epilog ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB3_10: ; %itofp-sw-epilog ; SDAG-NEXT: v_lshrrev_b32_e32 v3, 2, v0 ; SDAG-NEXT: v_and_or_b32 v0, v3, 1, v0 ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 ; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc -; SDAG-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1] ; SDAG-NEXT: v_and_b32_e32 v3, 0x800000, v1 +; SDAG-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1] ; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SDAG-NEXT: s_mov_b64 s[4:5], exec ; SDAG-NEXT: v_alignbit_b32 v9, v2, v1, 2 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_cmp_lg_u64 vcc, 0 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB3_12 ; SDAG-NEXT: ; %bb.11: ; %itofp-if-then20 ; SDAG-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1] ; SDAG-NEXT: v_alignbit_b32 v9, v2, v1, 3 ; SDAG-NEXT: v_mov_b32_e32 v6, v7 -; SDAG-NEXT: ; %bb.12: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; SDAG-NEXT: .LBB3_13: ; %Flow4 +; SDAG-NEXT: .LBB3_12: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: .LBB3_13: ; %itofp-if-end26 ; SDAG-NEXT: v_and_b32_e32 v0, 0xfffff, v9 ; SDAG-NEXT: v_lshl_or_b32 v0, v6, 20, v0 ; SDAG-NEXT: v_add_u32_e32 v5, 0x3ff00000, v0 -; SDAG-NEXT: .LBB3_14: ; %Flow5 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: .LBB3_14: ; %itofp-return ; SDAG-NEXT: v_mov_b32_e32 v0, v4 ; SDAG-NEXT: v_mov_b32_e32 v1, v5 ; SDAG-NEXT: s_setpc_b64 s[30:31] @@ -946,14 +1065,16 @@ define double @uitofp_i128_to_f64(i128 %x) { ; GISEL-LABEL: uitofp_i128_to_f64: ; GISEL: ; %bb.0: ; %itofp-entry ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GISEL-NEXT: v_or_b32_e32 v4, v0, v2 ; GISEL-NEXT: v_or_b32_e32 v5, v1, v3 ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GISEL-NEXT: s_mov_b64 s[4:5], 0 ; 
GISEL-NEXT: v_mov_b32_e32 v4, s4 +; GISEL-NEXT: s_mov_b64 s[6:7], exec ; GISEL-NEXT: v_mov_b32_e32 v5, s5 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_cbranch_execz .LBB3_14 +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB3_14 ; GISEL-NEXT: ; %bb.1: ; %itofp-if-end ; GISEL-NEXT: v_ffbh_u32_e32 v5, v0 ; GISEL-NEXT: v_ffbh_u32_e32 v4, v1 @@ -967,12 +1088,14 @@ define double @uitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: v_min_u32_e32 v5, v5, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v8, v5, v4, vcc ; GISEL-NEXT: v_sub_u32_e32 v7, 0x80, v8 -; GISEL-NEXT: v_sub_u32_e32 v6, 0x7f, v8 ; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 53, v7 +; GISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; GISEL-NEXT: v_sub_u32_e32 v6, 0x7f, v8 +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 ; GISEL-NEXT: ; implicit-def: $vgpr9 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB3_3 ; GISEL-NEXT: ; %bb.2: ; %itofp-if-else ; GISEL-NEXT: v_add_u32_e32 v2, 0xffffffb5, v8 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] @@ -982,18 +1105,24 @@ define double @uitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: ; implicit-def: $vgpr7 ; GISEL-NEXT: ; implicit-def: $vgpr0 ; GISEL-NEXT: ; implicit-def: $vgpr8 -; GISEL-NEXT: ; %bb.3: ; %Flow3 -; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB3_13 +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB3_3: ; %Flow3 +; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], exec +; GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB3_13 ; GISEL-NEXT: ; %bb.4: ; %NodeBlock ; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 55, v7 -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GISEL-NEXT: s_xor_b64 s[10:11], exec, s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB3_8 +; GISEL-NEXT: s_xor_b64 s[10:11], vcc, exec +; 
GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB3_8 ; GISEL-NEXT: ; %bb.5: ; %LeafBlock ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 55, v7 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc -; GISEL-NEXT: s_cbranch_execz .LBB3_7 +; GISEL-NEXT: s_mov_b64 s[12:13], exec +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB3_7 ; GISEL-NEXT: ; %bb.6: ; %itofp-sw-default ; GISEL-NEXT: v_sub_u32_e32 v13, 0x49, v8 ; GISEL-NEXT: v_sub_u32_e32 v9, 64, v13 @@ -1039,10 +1168,14 @@ define double @uitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: v_mov_b32_e32 v1, v9 ; GISEL-NEXT: v_mov_b32_e32 v2, v10 ; GISEL-NEXT: v_mov_b32_e32 v3, v11 -; GISEL-NEXT: .LBB3_7: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB3_7: ; %Flow1 +; GISEL-NEXT: s_or_b64 exec, exec, s[10:11] ; GISEL-NEXT: .LBB3_8: ; %Flow2 -; GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; GISEL-NEXT: s_xor_b64 s[4:5], s[10:11], exec +; GISEL-NEXT: s_cmp_lg_u64 s[10:11], 0 +; GISEL-NEXT: s_cmov_b64 exec, s[10:11] +; GISEL-NEXT: s_cbranch_scc0 .LBB3_10 ; GISEL-NEXT: ; %bb.9: ; %itofp-sw-bb ; GISEL-NEXT: v_lshlrev_b64 v[8:9], 1, v[0:1] ; GISEL-NEXT: v_lshlrev_b64 v[10:11], 1, v[2:3] @@ -1052,8 +1185,8 @@ define double @uitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: v_mov_b32_e32 v1, v9 ; GISEL-NEXT: v_mov_b32_e32 v2, v10 ; GISEL-NEXT: v_mov_b32_e32 v3, v11 -; GISEL-NEXT: ; %bb.10: ; %itofp-sw-epilog ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB3_10: ; %itofp-sw-epilog ; GISEL-NEXT: v_bfe_u32 v4, v0, 2, 1 ; GISEL-NEXT: v_or_b32_e32 v0, v0, v4 ; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 @@ -1066,25 +1199,28 @@ define double @uitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] ; GISEL-NEXT: v_lshlrev_b64 v[8:9], 30, v[2:3] ; GISEL-NEXT: v_lshrrev_b32_e32 v5, 2, v1 +; GISEL-NEXT: s_mov_b64 s[4:5], exec ; GISEL-NEXT: v_or_b32_e32 v9, v5, v8 -; GISEL-NEXT: 
s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB3_12 ; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20 ; GISEL-NEXT: v_lshlrev_b64 v[2:3], 29, v[2:3] ; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1] ; GISEL-NEXT: v_lshrrev_b32_e32 v0, 3, v1 ; GISEL-NEXT: v_or_b32_e32 v9, v0, v2 ; GISEL-NEXT: v_mov_b32_e32 v6, v7 -; GISEL-NEXT: ; %bb.12: ; %Flow ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GISEL-NEXT: .LBB3_13: ; %Flow4 +; GISEL-NEXT: .LBB3_12: ; %Flow ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: .LBB3_13: ; %itofp-if-end26 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x3ff00000 ; GISEL-NEXT: v_lshl_add_u32 v0, v6, 20, v0 ; GISEL-NEXT: v_and_b32_e32 v1, 0xfffff, v9 ; GISEL-NEXT: v_and_or_b32 v4, v4, -1, 0 ; GISEL-NEXT: v_or3_b32 v5, v1, v0, 0 -; GISEL-NEXT: .LBB3_14: ; %Flow5 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: .LBB3_14: ; %itofp-return ; GISEL-NEXT: v_mov_b32_e32 v0, v4 ; GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1099,9 +1235,11 @@ define half @sitofp_i128_to_f16(i128 %x) { ; SDAG-NEXT: v_or_b32_e32 v5, v1, v3 ; SDAG-NEXT: v_or_b32_e32 v4, v0, v2 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; SDAG-NEXT: s_mov_b64 s[6:7], exec ; SDAG-NEXT: v_mov_b32_e32 v4, 0 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc -; SDAG-NEXT: s_cbranch_execz .LBB4_14 +; SDAG-NEXT: s_cmp_lg_u64 vcc, 0 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB4_14 ; SDAG-NEXT: ; %bb.1: ; %itofp-if-end ; SDAG-NEXT: v_sub_co_u32_e32 v4, vcc, 0, v0 ; SDAG-NEXT: v_subb_co_u32_e32 v5, vcc, 0, v1, vcc @@ -1125,10 +1263,12 @@ define half @sitofp_i128_to_f16(i128 %x) { ; SDAG-NEXT: v_add_u32_e32 v6, 64, v6 ; SDAG-NEXT: v_cndmask_b32_e32 v7, v6, v2, vcc ; SDAG-NEXT: v_sub_u32_e32 v6, 0x80, v7 -; SDAG-NEXT: v_sub_u32_e32 v2, 0x7f, v7 ; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v6 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; 
SDAG-NEXT: s_xor_b64 s[4:5], vcc, exec +; SDAG-NEXT: v_sub_u32_e32 v2, 0x7f, v7 +; SDAG-NEXT: s_cmp_lg_u64 vcc, 0 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB4_3 ; SDAG-NEXT: ; %bb.2: ; %itofp-if-else ; SDAG-NEXT: v_add_u32_e32 v4, 0xffffff98, v7 ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] @@ -1138,18 +1278,24 @@ define half @sitofp_i128_to_f16(i128 %x) { ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr7 ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 -; SDAG-NEXT: ; %bb.3: ; %Flow3 -; SDAG-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB4_13 +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB4_3: ; %Flow3 +; SDAG-NEXT: s_xor_b64 s[8:9], s[4:5], exec +; SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 +; SDAG-NEXT: s_cmov_b64 exec, s[4:5] +; SDAG-NEXT: s_cbranch_scc0 .LBB4_13 ; SDAG-NEXT: ; %bb.4: ; %NodeBlock ; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 25, v6 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB4_8 +; SDAG-NEXT: s_xor_b64 s[10:11], vcc, exec +; SDAG-NEXT: s_cmp_lg_u64 vcc, 0 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB4_8 ; SDAG-NEXT: ; %bb.5: ; %LeafBlock ; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 26, v6 -; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc -; SDAG-NEXT: s_cbranch_execz .LBB4_7 +; SDAG-NEXT: s_mov_b64 s[12:13], exec +; SDAG-NEXT: s_cmp_lg_u64 vcc, 0 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB4_7 ; SDAG-NEXT: ; %bb.6: ; %itofp-sw-default ; SDAG-NEXT: v_sub_u32_e32 v12, 0x66, v7 ; SDAG-NEXT: v_sub_u32_e32 v10, 64, v12 @@ -1188,36 +1334,43 @@ define half @sitofp_i128_to_f16(i128 %x) { ; SDAG-NEXT: v_or_b32_e32 v8, v15, v0 ; SDAG-NEXT: v_mov_b32_e32 v0, v8 ; SDAG-NEXT: v_mov_b32_e32 v1, v9 -; SDAG-NEXT: .LBB4_7: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] +; SDAG-NEXT: .LBB4_7: ; %Flow1 +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB4_8: ; %Flow2 
-; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: s_xor_b64 s[4:5], s[10:11], exec +; SDAG-NEXT: s_cmp_lg_u64 s[10:11], 0 +; SDAG-NEXT: s_cmov_b64 exec, s[10:11] +; SDAG-NEXT: s_cbranch_scc0 .LBB4_10 ; SDAG-NEXT: ; %bb.9: ; %itofp-sw-bb ; SDAG-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; SDAG-NEXT: ; %bb.10: ; %itofp-sw-epilog ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB4_10: ; %itofp-sw-epilog ; SDAG-NEXT: v_lshrrev_b32_e32 v4, 2, v0 ; SDAG-NEXT: v_and_or_b32 v0, v4, 1, v0 ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 ; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; SDAG-NEXT: v_and_b32_e32 v4, 0x4000000, v0 ; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SDAG-NEXT: s_mov_b64 s[4:5], exec ; SDAG-NEXT: v_alignbit_b32 v8, v1, v0, 2 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_cmp_lg_u64 vcc, 0 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB4_12 ; SDAG-NEXT: ; %bb.11: ; %itofp-if-then20 ; SDAG-NEXT: v_alignbit_b32 v8, v1, v0, 3 ; SDAG-NEXT: v_mov_b32_e32 v2, v6 -; SDAG-NEXT: ; %bb.12: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; SDAG-NEXT: .LBB4_13: ; %Flow4 +; SDAG-NEXT: .LBB4_12: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: .LBB4_13: ; %itofp-if-end26 ; SDAG-NEXT: v_and_b32_e32 v0, 0x80000000, v3 ; SDAG-NEXT: v_lshl_add_u32 v1, v2, 23, 1.0 ; SDAG-NEXT: v_and_b32_e32 v2, 0x7fffff, v8 ; SDAG-NEXT: v_or3_b32 v0, v2, v0, v1 ; SDAG-NEXT: v_cvt_f16_f32_e32 v4, v0 -; SDAG-NEXT: .LBB4_14: ; %Flow5 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: .LBB4_14: ; %itofp-return ; SDAG-NEXT: v_mov_b32_e32 v0, v4 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1226,11 +1379,13 @@ define half @sitofp_i128_to_f16(i128 %x) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_or_b32_e32 v4, v0, v2 ; GISEL-NEXT: v_or_b32_e32 v5, v1, v3 -; GISEL-NEXT: s_mov_b32 s4, 0 ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], exec ; 
GISEL-NEXT: v_mov_b32_e32 v4, s4 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_cbranch_execz .LBB4_14 +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB4_14 ; GISEL-NEXT: ; %bb.1: ; %itofp-if-end ; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v3 ; GISEL-NEXT: v_xor_b32_e32 v0, v6, v0 @@ -1253,11 +1408,13 @@ define half @sitofp_i128_to_f16(i128 %x) { ; GISEL-NEXT: v_min_u32_e32 v5, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc ; GISEL-NEXT: v_sub_u32_e32 v8, 0x80, v5 -; GISEL-NEXT: v_sub_u32_e32 v7, 0x7f, v5 ; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 24, v8 +; GISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; GISEL-NEXT: v_sub_u32_e32 v7, 0x7f, v5 +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 ; GISEL-NEXT: ; implicit-def: $vgpr4 -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB4_3 ; GISEL-NEXT: ; %bb.2: ; %itofp-if-else ; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff98, v5 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] @@ -1267,18 +1424,24 @@ define half @sitofp_i128_to_f16(i128 %x) { ; GISEL-NEXT: ; implicit-def: $vgpr0 ; GISEL-NEXT: ; implicit-def: $vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr2 -; GISEL-NEXT: ; %bb.3: ; %Flow3 -; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB4_13 +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB4_3: ; %Flow3 +; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], exec +; GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB4_13 ; GISEL-NEXT: ; %bb.4: ; %NodeBlock ; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 26, v8 -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GISEL-NEXT: s_xor_b64 s[10:11], exec, s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB4_8 +; GISEL-NEXT: s_xor_b64 s[10:11], vcc, exec +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB4_8 ; 
GISEL-NEXT: ; %bb.5: ; %LeafBlock ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 26, v8 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc -; GISEL-NEXT: s_cbranch_execz .LBB4_7 +; GISEL-NEXT: s_mov_b64 s[12:13], exec +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB4_7 ; GISEL-NEXT: ; %bb.6: ; %itofp-sw-default ; GISEL-NEXT: v_sub_u32_e32 v4, 0x66, v5 ; GISEL-NEXT: v_sub_u32_e32 v11, 64, v4 @@ -1321,37 +1484,44 @@ define half @sitofp_i128_to_f16(i128 %x) { ; GISEL-NEXT: v_mov_b32_e32 v1, v4 ; GISEL-NEXT: v_mov_b32_e32 v2, v5 ; GISEL-NEXT: v_mov_b32_e32 v3, v6 -; GISEL-NEXT: .LBB4_7: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB4_7: ; %Flow1 +; GISEL-NEXT: s_or_b64 exec, exec, s[10:11] ; GISEL-NEXT: .LBB4_8: ; %Flow2 -; GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; GISEL-NEXT: s_xor_b64 s[4:5], s[10:11], exec +; GISEL-NEXT: s_cmp_lg_u64 s[10:11], 0 +; GISEL-NEXT: s_cmov_b64 exec, s[10:11] +; GISEL-NEXT: s_cbranch_scc0 .LBB4_10 ; GISEL-NEXT: ; %bb.9: ; %itofp-sw-bb ; GISEL-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GISEL-NEXT: ; %bb.10: ; %itofp-sw-epilog ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB4_10: ; %itofp-sw-epilog ; GISEL-NEXT: v_bfe_u32 v2, v0, 2, 1 ; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 ; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 ; GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GISEL-NEXT: v_and_b32_e32 v2, 0x4000000, v0 ; GISEL-NEXT: v_mov_b32_e32 v3, 0 -; GISEL-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1] ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1] +; GISEL-NEXT: s_mov_b64 s[4:5], exec +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB4_12 ; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20 ; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1] ; GISEL-NEXT: v_mov_b32_e32 v7, v8 -; GISEL-NEXT: ; %bb.12: ; %Flow ; GISEL-NEXT: s_or_b64 
exec, exec, s[4:5] -; GISEL-NEXT: .LBB4_13: ; %Flow4 +; GISEL-NEXT: .LBB4_12: ; %Flow ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: .LBB4_13: ; %itofp-if-end26 ; GISEL-NEXT: v_and_b32_e32 v0, 0x80000000, v6 ; GISEL-NEXT: v_lshl_add_u32 v1, v7, 23, 1.0 ; GISEL-NEXT: v_and_b32_e32 v2, 0x7fffff, v4 ; GISEL-NEXT: v_or3_b32 v0, v2, v0, v1 ; GISEL-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GISEL-NEXT: .LBB4_14: ; %Flow5 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: .LBB4_14: ; %itofp-return ; GISEL-NEXT: v_mov_b32_e32 v0, v4 ; GISEL-NEXT: s_setpc_b64 s[30:31] %cvt = sitofp i128 %x to half @@ -1365,9 +1535,11 @@ define half @uitofp_i128_to_f16(i128 %x) { ; SDAG-NEXT: v_or_b32_e32 v5, v1, v3 ; SDAG-NEXT: v_or_b32_e32 v4, v0, v2 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; SDAG-NEXT: s_mov_b64 s[6:7], exec ; SDAG-NEXT: v_mov_b32_e32 v4, 0 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc -; SDAG-NEXT: s_cbranch_execz .LBB5_14 +; SDAG-NEXT: s_cmp_lg_u64 vcc, 0 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB5_14 ; SDAG-NEXT: ; %bb.1: ; %itofp-if-end ; SDAG-NEXT: v_ffbh_u32_e32 v4, v2 ; SDAG-NEXT: v_add_u32_e32 v4, 32, v4 @@ -1381,11 +1553,13 @@ define half @uitofp_i128_to_f16(i128 %x) { ; SDAG-NEXT: v_add_u32_e32 v5, 64, v5 ; SDAG-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc ; SDAG-NEXT: v_sub_u32_e32 v5, 0x80, v6 -; SDAG-NEXT: v_sub_u32_e32 v4, 0x7f, v6 ; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v5 +; SDAG-NEXT: s_xor_b64 s[4:5], vcc, exec +; SDAG-NEXT: v_sub_u32_e32 v4, 0x7f, v6 +; SDAG-NEXT: s_cmp_lg_u64 vcc, 0 ; SDAG-NEXT: ; implicit-def: $vgpr7 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB5_3 ; SDAG-NEXT: ; %bb.2: ; %itofp-if-else ; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff98, v6 ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] @@ -1395,18 +1569,24 @@ define half @uitofp_i128_to_f16(i128 %x) { ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; 
SDAG-NEXT: ; implicit-def: $vgpr6 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: ; %bb.3: ; %Flow3 -; SDAG-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB5_13 +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB5_3: ; %Flow3 +; SDAG-NEXT: s_xor_b64 s[8:9], s[4:5], exec +; SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 +; SDAG-NEXT: s_cmov_b64 exec, s[4:5] +; SDAG-NEXT: s_cbranch_scc0 .LBB5_13 ; SDAG-NEXT: ; %bb.4: ; %NodeBlock ; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 25, v5 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB5_8 +; SDAG-NEXT: s_xor_b64 s[10:11], vcc, exec +; SDAG-NEXT: s_cmp_lg_u64 vcc, 0 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB5_8 ; SDAG-NEXT: ; %bb.5: ; %LeafBlock ; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 26, v5 -; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc -; SDAG-NEXT: s_cbranch_execz .LBB5_7 +; SDAG-NEXT: s_mov_b64 s[12:13], exec +; SDAG-NEXT: s_cmp_lg_u64 vcc, 0 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB5_7 ; SDAG-NEXT: ; %bb.6: ; %itofp-sw-default ; SDAG-NEXT: v_sub_u32_e32 v11, 0x66, v6 ; SDAG-NEXT: v_sub_u32_e32 v9, 64, v11 @@ -1445,35 +1625,42 @@ define half @uitofp_i128_to_f16(i128 %x) { ; SDAG-NEXT: v_or_b32_e32 v7, v14, v0 ; SDAG-NEXT: v_mov_b32_e32 v0, v7 ; SDAG-NEXT: v_mov_b32_e32 v1, v8 -; SDAG-NEXT: .LBB5_7: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] +; SDAG-NEXT: .LBB5_7: ; %Flow1 +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB5_8: ; %Flow2 -; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: s_xor_b64 s[4:5], s[10:11], exec +; SDAG-NEXT: s_cmp_lg_u64 s[10:11], 0 +; SDAG-NEXT: s_cmov_b64 exec, s[10:11] +; SDAG-NEXT: s_cbranch_scc0 .LBB5_10 ; SDAG-NEXT: ; %bb.9: ; %itofp-sw-bb ; SDAG-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; SDAG-NEXT: ; %bb.10: ; %itofp-sw-epilog ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB5_10: ; %itofp-sw-epilog ; 
SDAG-NEXT: v_lshrrev_b32_e32 v2, 2, v0 ; SDAG-NEXT: v_and_or_b32 v0, v2, 1, v0 ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 ; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; SDAG-NEXT: v_and_b32_e32 v2, 0x4000000, v0 ; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SDAG-NEXT: s_mov_b64 s[4:5], exec ; SDAG-NEXT: v_alignbit_b32 v7, v1, v0, 2 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_cmp_lg_u64 vcc, 0 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB5_12 ; SDAG-NEXT: ; %bb.11: ; %itofp-if-then20 ; SDAG-NEXT: v_alignbit_b32 v7, v1, v0, 3 ; SDAG-NEXT: v_mov_b32_e32 v4, v5 -; SDAG-NEXT: ; %bb.12: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; SDAG-NEXT: .LBB5_13: ; %Flow4 +; SDAG-NEXT: .LBB5_12: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: .LBB5_13: ; %itofp-if-end26 ; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v7 ; SDAG-NEXT: v_lshl_or_b32 v0, v4, 23, v0 ; SDAG-NEXT: v_add_u32_e32 v0, 1.0, v0 ; SDAG-NEXT: v_cvt_f16_f32_e32 v4, v0 -; SDAG-NEXT: .LBB5_14: ; %Flow5 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: .LBB5_14: ; %itofp-return ; SDAG-NEXT: v_mov_b32_e32 v0, v4 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1482,11 +1669,13 @@ define half @uitofp_i128_to_f16(i128 %x) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_or_b32_e32 v4, v0, v2 ; GISEL-NEXT: v_or_b32_e32 v5, v1, v3 -; GISEL-NEXT: s_mov_b32 s4, 0 ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], exec ; GISEL-NEXT: v_mov_b32_e32 v4, s4 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_cbranch_execz .LBB5_14 +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB5_14 ; GISEL-NEXT: ; %bb.1: ; %itofp-if-end ; GISEL-NEXT: v_ffbh_u32_e32 v5, v0 ; GISEL-NEXT: v_ffbh_u32_e32 v4, v1 @@ -1500,11 +1689,13 @@ define half @uitofp_i128_to_f16(i128 %x) { ; GISEL-NEXT: v_min_u32_e32 v5, v5, v6 ; GISEL-NEXT: 
v_cndmask_b32_e32 v5, v5, v4, vcc ; GISEL-NEXT: v_sub_u32_e32 v7, 0x80, v5 -; GISEL-NEXT: v_sub_u32_e32 v6, 0x7f, v5 ; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 24, v7 +; GISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; GISEL-NEXT: v_sub_u32_e32 v6, 0x7f, v5 +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 ; GISEL-NEXT: ; implicit-def: $vgpr4 -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB5_3 ; GISEL-NEXT: ; %bb.2: ; %itofp-if-else ; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff98, v5 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] @@ -1514,18 +1705,24 @@ define half @uitofp_i128_to_f16(i128 %x) { ; GISEL-NEXT: ; implicit-def: $vgpr0 ; GISEL-NEXT: ; implicit-def: $vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr2 -; GISEL-NEXT: ; %bb.3: ; %Flow3 -; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB5_13 +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB5_3: ; %Flow3 +; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], exec +; GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB5_13 ; GISEL-NEXT: ; %bb.4: ; %NodeBlock ; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 26, v7 -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GISEL-NEXT: s_xor_b64 s[10:11], exec, s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB5_8 +; GISEL-NEXT: s_xor_b64 s[10:11], vcc, exec +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB5_8 ; GISEL-NEXT: ; %bb.5: ; %LeafBlock ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 26, v7 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc -; GISEL-NEXT: s_cbranch_execz .LBB5_7 +; GISEL-NEXT: s_mov_b64 s[12:13], exec +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB5_7 ; GISEL-NEXT: ; %bb.6: ; %itofp-sw-default ; GISEL-NEXT: v_sub_u32_e32 v4, 0x66, v5 ; GISEL-NEXT: v_sub_u32_e32 v10, 64, v4 @@ -1568,36 +1765,43 @@ define half 
@uitofp_i128_to_f16(i128 %x) { ; GISEL-NEXT: v_mov_b32_e32 v1, v4 ; GISEL-NEXT: v_mov_b32_e32 v2, v5 ; GISEL-NEXT: v_mov_b32_e32 v3, v6 -; GISEL-NEXT: .LBB5_7: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB5_7: ; %Flow1 +; GISEL-NEXT: s_or_b64 exec, exec, s[10:11] ; GISEL-NEXT: .LBB5_8: ; %Flow2 -; GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; GISEL-NEXT: s_xor_b64 s[4:5], s[10:11], exec +; GISEL-NEXT: s_cmp_lg_u64 s[10:11], 0 +; GISEL-NEXT: s_cmov_b64 exec, s[10:11] +; GISEL-NEXT: s_cbranch_scc0 .LBB5_10 ; GISEL-NEXT: ; %bb.9: ; %itofp-sw-bb ; GISEL-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GISEL-NEXT: ; %bb.10: ; %itofp-sw-epilog ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB5_10: ; %itofp-sw-epilog ; GISEL-NEXT: v_bfe_u32 v2, v0, 2, 1 ; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 ; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 ; GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GISEL-NEXT: v_and_b32_e32 v2, 0x4000000, v0 ; GISEL-NEXT: v_mov_b32_e32 v3, 0 -; GISEL-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1] ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1] +; GISEL-NEXT: s_mov_b64 s[4:5], exec +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB5_12 ; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20 ; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1] ; GISEL-NEXT: v_mov_b32_e32 v6, v7 -; GISEL-NEXT: ; %bb.12: ; %Flow ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GISEL-NEXT: .LBB5_13: ; %Flow4 +; GISEL-NEXT: .LBB5_12: ; %Flow ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: .LBB5_13: ; %itofp-if-end26 ; GISEL-NEXT: v_lshl_add_u32 v0, v6, 23, 1.0 ; GISEL-NEXT: v_mov_b32_e32 v1, 0x7fffff ; GISEL-NEXT: v_and_or_b32 v0, v4, v1, v0 ; GISEL-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GISEL-NEXT: .LBB5_14: ; %Flow5 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: .LBB5_14: ; %itofp-return ; GISEL-NEXT: 
v_mov_b32_e32 v0, v4 ; GISEL-NEXT: s_setpc_b64 s[30:31] %cvt = uitofp i128 %x to half diff --git a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll index 0a70734a65c206..c0b3dc53e5b6b4 100644 --- a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll @@ -13,31 +13,36 @@ define amdgpu_ps void @return_void(float %0) #0 { ; CHECK-LABEL: return_void: ; CHECK: ; %bb.0: ; %main_body -; CHECK-NEXT: s_mov_b64 s[0:1], exec -; CHECK-NEXT: s_mov_b32 s2, 0x41200000 -; CHECK-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v0 -; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc -; CHECK-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; CHECK-NEXT: s_cbranch_execz .LBB0_3 +; CHECK-NEXT: s_mov_b64 s[2:3], exec +; CHECK-NEXT: s_mov_b32 s0, 0x41200000 +; CHECK-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 +; CHECK-NEXT: s_xor_b64 s[0:1], vcc, exec +; CHECK-NEXT: s_and_b64 s[4:5], vcc, -1 +; CHECK-NEXT: s_cmov_b64 exec, vcc +; CHECK-NEXT: s_cbranch_scc0 .LBB0_4 ; CHECK-NEXT: .LBB0_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], exec -; CHECK-NEXT: s_cbranch_scc0 .LBB0_6 +; CHECK-NEXT: s_andn2_b64 s[2:3], s[2:3], exec +; CHECK-NEXT: s_cbranch_scc0 .LBB0_7 ; CHECK-NEXT: ; %bb.2: ; %loop ; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 ; CHECK-NEXT: s_mov_b64 exec, 0 ; CHECK-NEXT: s_mov_b64 vcc, 0 ; CHECK-NEXT: s_branch .LBB0_1 -; CHECK-NEXT: .LBB0_3: ; %Flow1 -; CHECK-NEXT: s_andn2_saveexec_b64 s[0:1], s[2:3] -; CHECK-NEXT: s_cbranch_execz .LBB0_5 -; CHECK-NEXT: ; %bb.4: ; %end +; CHECK-NEXT: ; %bb.3: ; %Flow +; CHECK-NEXT: s_or_b64 exec, exec, s[0:1] +; CHECK-NEXT: .LBB0_4: ; %Flow1 +; CHECK-NEXT: s_xor_b64 s[2:3], s[0:1], exec +; CHECK-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; CHECK-NEXT: s_cmov_b64 exec, s[0:1] +; CHECK-NEXT: s_cbranch_scc0 .LBB0_6 +; CHECK-NEXT: ; %bb.5: ; %end ; CHECK-NEXT: v_mov_b32_e32 v0, 1.0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: exp mrt0 v1, v1, v1, 
v0 done vm -; CHECK-NEXT: .LBB0_5: ; %UnifiedReturnBlock +; CHECK-NEXT: .LBB0_6: ; %UnifiedReturnBlock ; CHECK-NEXT: s_endpgm -; CHECK-NEXT: .LBB0_6: +; CHECK-NEXT: .LBB0_7: ; CHECK-NEXT: s_mov_b64 exec, 0 ; CHECK-NEXT: exp null off, off, off, off done vm ; CHECK-NEXT: s_endpgm @@ -57,30 +62,35 @@ end: define amdgpu_ps void @return_void_compr(float %0) #0 { ; CHECK-LABEL: return_void_compr: ; CHECK: ; %bb.0: ; %main_body -; CHECK-NEXT: s_mov_b64 s[0:1], exec -; CHECK-NEXT: s_mov_b32 s2, 0x41200000 -; CHECK-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v0 -; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc -; CHECK-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; CHECK-NEXT: s_cbranch_execz .LBB1_3 +; CHECK-NEXT: s_mov_b64 s[2:3], exec +; CHECK-NEXT: s_mov_b32 s0, 0x41200000 +; CHECK-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 +; CHECK-NEXT: s_xor_b64 s[0:1], vcc, exec +; CHECK-NEXT: s_and_b64 s[4:5], vcc, -1 +; CHECK-NEXT: s_cmov_b64 exec, vcc +; CHECK-NEXT: s_cbranch_scc0 .LBB1_4 ; CHECK-NEXT: .LBB1_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], exec -; CHECK-NEXT: s_cbranch_scc0 .LBB1_6 +; CHECK-NEXT: s_andn2_b64 s[2:3], s[2:3], exec +; CHECK-NEXT: s_cbranch_scc0 .LBB1_7 ; CHECK-NEXT: ; %bb.2: ; %loop ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 ; CHECK-NEXT: s_mov_b64 exec, 0 ; CHECK-NEXT: s_mov_b64 vcc, 0 ; CHECK-NEXT: s_branch .LBB1_1 -; CHECK-NEXT: .LBB1_3: ; %Flow1 -; CHECK-NEXT: s_andn2_saveexec_b64 s[0:1], s[2:3] -; CHECK-NEXT: s_cbranch_execz .LBB1_5 -; CHECK-NEXT: ; %bb.4: ; %end +; CHECK-NEXT: ; %bb.3: ; %Flow +; CHECK-NEXT: s_or_b64 exec, exec, s[0:1] +; CHECK-NEXT: .LBB1_4: ; %Flow1 +; CHECK-NEXT: s_xor_b64 s[2:3], s[0:1], exec +; CHECK-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; CHECK-NEXT: s_cmov_b64 exec, s[0:1] +; CHECK-NEXT: s_cbranch_scc0 .LBB1_6 +; CHECK-NEXT: ; %bb.5: ; %end ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: exp mrt0 v0, off, v0, off done compr vm -; CHECK-NEXT: .LBB1_5: ; %UnifiedReturnBlock +; CHECK-NEXT: 
.LBB1_6: ; %UnifiedReturnBlock ; CHECK-NEXT: s_endpgm -; CHECK-NEXT: .LBB1_6: +; CHECK-NEXT: .LBB1_7: ; CHECK-NEXT: s_mov_b64 exec, 0 ; CHECK-NEXT: exp null off, off, off, off done vm ; CHECK-NEXT: s_endpgm @@ -132,27 +142,29 @@ define amdgpu_ps float @return_nonvoid(float %0) #0 { ; CHECK-NEXT: s_mov_b64 s[0:1], exec ; CHECK-NEXT: s_mov_b32 s2, 0x41200000 ; CHECK-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v0 -; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc -; CHECK-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; CHECK-NEXT: s_cbranch_execz .LBB3_3 +; CHECK-NEXT: s_xor_b64 s[2:3], vcc, exec +; CHECK-NEXT: s_and_b64 s[4:5], vcc, -1 +; CHECK-NEXT: s_cmov_b64 exec, vcc +; CHECK-NEXT: s_cbranch_scc0 .LBB3_4 ; CHECK-NEXT: .LBB3_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], exec -; CHECK-NEXT: s_cbranch_scc0 .LBB3_4 +; CHECK-NEXT: s_cbranch_scc0 .LBB3_5 ; CHECK-NEXT: ; %bb.2: ; %loop ; CHECK-NEXT: ; in Loop: Header=BB3_1 Depth=1 ; CHECK-NEXT: s_mov_b64 exec, 0 ; CHECK-NEXT: s_mov_b64 vcc, exec ; CHECK-NEXT: s_cbranch_execnz .LBB3_1 -; CHECK-NEXT: .LBB3_3: ; %Flow1 +; CHECK-NEXT: ; %bb.3: ; %Flow ; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] +; CHECK-NEXT: .LBB3_4: ; %UnifiedReturnBlock ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_branch .LBB3_5 -; CHECK-NEXT: .LBB3_4: +; CHECK-NEXT: s_branch .LBB3_6 +; CHECK-NEXT: .LBB3_5: ; CHECK-NEXT: s_mov_b64 exec, 0 ; CHECK-NEXT: exp null off, off, off, off done vm ; CHECK-NEXT: s_endpgm -; CHECK-NEXT: .LBB3_5: +; CHECK-NEXT: .LBB3_6: main_body: %cmp = fcmp olt float %0, 1.000000e+01 br i1 %cmp, label %end, label %loop diff --git a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll index 397502711283e5..1ede5d84d067c7 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll @@ -165,10 +165,12 @@ define void @func_uses_lds_multi(i1 %cond) { ; GFX8-SDAG-NEXT: 
v_and_b32_e32 v0, 1, v0 ; GFX8-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX8-SDAG-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GFX8-SDAG-NEXT: s_and_b64 s[6:7], s[4:5], exec +; GFX8-SDAG-NEXT: s_xor_b64 s[4:5], s[6:7], exec ; GFX8-SDAG-NEXT: s_mov_b32 m0, -1 -; GFX8-SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GFX8-SDAG-NEXT: s_xor_b64 s[4:5], exec, s[6:7] -; GFX8-SDAG-NEXT: s_cbranch_execz .LBB2_2 +; GFX8-SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX8-SDAG-NEXT: s_cmov_b64 exec, s[6:7] +; GFX8-SDAG-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX8-SDAG-NEXT: ; %bb.1: ; %bb1 ; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 1 ; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0xc8 @@ -176,18 +178,21 @@ define void @func_uses_lds_multi(i1 %cond) { ; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-SDAG-NEXT: s_trap 2 +; GFX8-SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-SDAG-NEXT: .LBB2_2: ; %Flow -; GFX8-SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX8-SDAG-NEXT: s_cbranch_execz .LBB2_4 +; GFX8-SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX8-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8-SDAG-NEXT: s_cmov_b64 exec, s[4:5] +; GFX8-SDAG-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX8-SDAG-NEXT: ; %bb.3: ; %bb0 ; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0xc8 +; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0xc8 ; GFX8-SDAG-NEXT: ds_write_b32 v0, v0 -; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-SDAG-NEXT: s_trap 2 +; GFX8-SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-SDAG-NEXT: .LBB2_4: ; %ret -; GFX8-SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 2 ; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0xc8 ; GFX8-SDAG-NEXT: ds_write_b32 v0, v0 @@ -202,9 +207,11 @@ define void @func_uses_lds_multi(i1 %cond) { ; GFX8-GISEL-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX8-GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 -; 
GFX8-GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GFX8-GISEL-NEXT: s_xor_b64 s[4:5], exec, s[6:7] -; GFX8-GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX8-GISEL-NEXT: s_and_b64 s[6:7], s[4:5], exec +; GFX8-GISEL-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; GFX8-GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX8-GISEL-NEXT: s_cmov_b64 exec, s[6:7] +; GFX8-GISEL-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX8-GISEL-NEXT: ; %bb.1: ; %bb1 ; GFX8-GISEL-NEXT: s_mov_b64 s[6:7], 0xc8 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, 1 @@ -213,19 +220,22 @@ define void @func_uses_lds_multi(i1 %cond) { ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-GISEL-NEXT: s_trap 2 ; GFX8-GISEL-NEXT: ds_write_b32 v0, v0 +; GFX8-GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-GISEL-NEXT: .LBB2_2: ; %Flow -; GFX8-GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX8-GISEL-NEXT: s_cbranch_execz .LBB2_4 +; GFX8-GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX8-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8-GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GFX8-GISEL-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX8-GISEL-NEXT: ; %bb.3: ; %bb0 -; GFX8-GISEL-NEXT: s_mov_b64 s[6:7], 0xc8 +; GFX8-GISEL-NEXT: s_mov_b64 s[4:5], 0xc8 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-GISEL-NEXT: s_mov_b32 m0, -1 -; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-GISEL-NEXT: s_trap 2 ; GFX8-GISEL-NEXT: ds_write_b32 v0, v0 +; GFX8-GISEL-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-GISEL-NEXT: .LBB2_4: ; %ret -; GFX8-GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-GISEL-NEXT: s_mov_b64 s[4:5], 0xc8 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, 2 ; GFX8-GISEL-NEXT: s_mov_b32 m0, -1 @@ -242,22 +252,27 @@ define void @func_uses_lds_multi(i1 %cond) { ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-SDAG-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GFX9-SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GFX9-SDAG-NEXT: s_xor_b64 s[4:5], exec, s[6:7] -; 
GFX9-SDAG-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-SDAG-NEXT: s_and_b64 s[6:7], s[4:5], exec +; GFX9-SDAG-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX9-SDAG-NEXT: s_cmov_b64 exec, s[6:7] +; GFX9-SDAG-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX9-SDAG-NEXT: ; %bb.1: ; %bb1 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-SDAG-NEXT: ds_write_b32 v0, v0 ; GFX9-SDAG-NEXT: s_trap 2 +; GFX9-SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-SDAG-NEXT: .LBB2_2: ; %Flow -; GFX9-SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-SDAG-NEXT: s_cbranch_execz .LBB2_4 +; GFX9-SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-SDAG-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-SDAG-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX9-SDAG-NEXT: ; %bb.3: ; %bb0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-SDAG-NEXT: ds_write_b32 v0, v0 ; GFX9-SDAG-NEXT: s_trap 2 +; GFX9-SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-SDAG-NEXT: .LBB2_4: ; %ret -; GFX9-SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 2 ; GFX9-SDAG-NEXT: ds_write_b32 v0, v0 ; GFX9-SDAG-NEXT: s_trap 2 @@ -270,22 +285,27 @@ define void @func_uses_lds_multi(i1 %cond) { ; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GFX9-GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GFX9-GISEL-NEXT: s_xor_b64 s[4:5], exec, s[6:7] -; GFX9-GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-GISEL-NEXT: s_and_b64 s[6:7], s[4:5], exec +; GFX9-GISEL-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX9-GISEL-NEXT: s_cmov_b64 exec, s[6:7] +; GFX9-GISEL-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX9-GISEL-NEXT: ; %bb.1: ; %bb1 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-GISEL-NEXT: s_trap 2 ; GFX9-GISEL-NEXT: ds_write_b32 v0, v0 +; GFX9-GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-GISEL-NEXT: .LBB2_2: ; %Flow -; GFX9-GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; 
GFX9-GISEL-NEXT: s_cbranch_execz .LBB2_4 +; GFX9-GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-GISEL-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX9-GISEL-NEXT: ; %bb.3: ; %bb0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_trap 2 ; GFX9-GISEL-NEXT: ds_write_b32 v0, v0 +; GFX9-GISEL-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-GISEL-NEXT: .LBB2_4: ; %ret -; GFX9-GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 2 ; GFX9-GISEL-NEXT: s_trap 2 ; GFX9-GISEL-NEXT: ds_write_b32 v0, v0 @@ -298,29 +318,36 @@ define void @func_uses_lds_multi(i1 %cond) { ; SDAG-NEXT: v_and_b32_e32 v0, 1, v0 ; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; SDAG-NEXT: s_xor_b64 s[4:5], vcc, -1 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[6:7] -; SDAG-NEXT: s_cbranch_execz .LBB2_2 +; SDAG-NEXT: s_and_b64 s[6:7], s[4:5], exec +; SDAG-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0 +; SDAG-NEXT: s_cmov_b64 exec, s[6:7] +; SDAG-NEXT: s_cbranch_scc0 .LBB2_3 ; SDAG-NEXT: ; %bb.1: ; %bb1 ; SDAG-NEXT: v_mov_b32_e32 v0, 1 ; SDAG-NEXT: ds_write_b32 v0, v0 -; SDAG-NEXT: s_cbranch_execnz .LBB2_6 -; SDAG-NEXT: .LBB2_2: ; %Flow -; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB2_4 -; SDAG-NEXT: ; %bb.3: ; %bb0 +; SDAG-NEXT: s_cbranch_execnz .LBB2_8 +; SDAG-NEXT: ; %bb.2: ; %bb1 +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB2_3: ; %Flow +; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 +; SDAG-NEXT: s_cmov_b64 exec, s[4:5] +; SDAG-NEXT: s_cbranch_scc0 .LBB2_6 +; SDAG-NEXT: ; %bb.4: ; %bb0 ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: ds_write_b32 v0, v0 -; SDAG-NEXT: s_cbranch_execnz .LBB2_6 -; SDAG-NEXT: .LBB2_4: ; %ret -; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: s_cbranch_execnz .LBB2_8 +; SDAG-NEXT: ; %bb.5: ; %bb0 +; SDAG-NEXT: s_or_b64 
exec, exec, s[6:7] +; SDAG-NEXT: .LBB2_6: ; %ret ; SDAG-NEXT: v_mov_b32_e32 v0, 2 ; SDAG-NEXT: ds_write_b32 v0, v0 -; SDAG-NEXT: s_cbranch_execnz .LBB2_6 -; SDAG-NEXT: ; %bb.5: ; %ret +; SDAG-NEXT: s_cbranch_execnz .LBB2_8 +; SDAG-NEXT: ; %bb.7: ; %ret ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: s_setpc_b64 s[30:31] -; SDAG-NEXT: .LBB2_6: +; SDAG-NEXT: .LBB2_8: ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: func_uses_lds_multi: @@ -329,24 +356,29 @@ define void @func_uses_lds_multi(i1 %cond) { ; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[6:7] -; GISEL-NEXT: s_cbranch_execz .LBB2_3 +; GISEL-NEXT: s_and_b64 s[6:7], s[4:5], exec +; GISEL-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GISEL-NEXT: s_cmov_b64 exec, s[6:7] +; GISEL-NEXT: s_cbranch_scc0 .LBB2_3 ; GISEL-NEXT: ; %bb.1: ; %bb1 ; GISEL-NEXT: s_cbranch_execnz .LBB2_8 ; GISEL-NEXT: ; %bb.2: ; %bb1 ; GISEL-NEXT: v_mov_b32_e32 v0, 1 ; GISEL-NEXT: ds_write_b32 v0, v0 +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GISEL-NEXT: .LBB2_3: ; %Flow -; GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB2_6 +; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB2_6 ; GISEL-NEXT: ; %bb.4: ; %bb0 ; GISEL-NEXT: s_cbranch_execnz .LBB2_8 ; GISEL-NEXT: ; %bb.5: ; %bb0 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: ds_write_b32 v0, v0 +; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] ; GISEL-NEXT: .LBB2_6: ; %ret -; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GISEL-NEXT: s_cbranch_execnz .LBB2_8 ; GISEL-NEXT: ; %bb.7: ; %ret ; GISEL-NEXT: v_mov_b32_e32 v0, 2 @@ -467,8 +499,10 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) { ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX8-SDAG-NEXT: v_and_b32_e32 v3, 
1, v3 ; GFX8-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX8-SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-SDAG-NEXT: s_cbranch_execz .LBB4_2 +; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], exec +; GFX8-SDAG-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX8-SDAG-NEXT: s_cmov_b64 exec, vcc +; GFX8-SDAG-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX8-SDAG-NEXT: ; %bb.1: ; %use.bb ; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-SDAG-NEXT: s_mov_b32 m0, -1 @@ -479,8 +513,8 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) { ; GFX8-SDAG-NEXT: s_trap 2 ; GFX8-SDAG-NEXT: flat_load_dword v0, v[1:2] glc ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX8-SDAG-NEXT: .LBB4_2: ; %ret ; GFX8-SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-SDAG-NEXT: .LBB4_2: ; %ret ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: func_uses_lds_phi_after: @@ -491,8 +525,10 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) { ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX8-GISEL-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX8-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX8-GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX8-GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8-GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX8-GISEL-NEXT: s_cmov_b64 exec, vcc +; GFX8-GISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX8-GISEL-NEXT: ; %bb.1: ; %use.bb ; GFX8-GISEL-NEXT: s_mov_b64 s[6:7], 0xc8 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -503,8 +539,8 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) { ; GFX8-GISEL-NEXT: ds_write_b32 v0, v0 ; GFX8-GISEL-NEXT: flat_load_dword v0, v[1:2] glc ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX8-GISEL-NEXT: .LBB4_2: ; %ret ; GFX8-GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-GISEL-NEXT: .LBB4_2: ; %ret ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -516,16 +552,18 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) { ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: 
v_and_b32_e32 v3, 1, v3 ; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX9-SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-SDAG-NEXT: s_cbranch_execz .LBB4_2 +; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec +; GFX9-SDAG-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-SDAG-NEXT: s_cmov_b64 exec, vcc +; GFX9-SDAG-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX9-SDAG-NEXT: ; %bb.1: ; %use.bb ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-SDAG-NEXT: ds_write_b32 v0, v0 ; GFX9-SDAG-NEXT: s_trap 2 ; GFX9-SDAG-NEXT: global_load_dword v0, v[1:2], off glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: .LBB4_2: ; %ret ; GFX9-SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-SDAG-NEXT: .LBB4_2: ; %ret ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -537,16 +575,18 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) { ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX9-GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9-GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-GISEL-NEXT: s_cmov_b64 exec, vcc +; GFX9-GISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX9-GISEL-NEXT: ; %bb.1: ; %use.bb ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_trap 2 ; GFX9-GISEL-NEXT: ds_write_b32 v0, v0 ; GFX9-GISEL-NEXT: global_load_dword v0, v[1:2], off glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: .LBB4_2: ; %ret ; GFX9-GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-GISEL-NEXT: .LBB4_2: ; %ret ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -558,8 +598,10 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) { ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: v_and_b32_e32 v3, 1, v3 ; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_cbranch_execz .LBB4_3 +; SDAG-NEXT: s_mov_b64 s[4:5], exec 
+; SDAG-NEXT: s_cmp_lg_u64 vcc, 0 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB4_3 ; SDAG-NEXT: ; %bb.1: ; %use.bb ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: ds_write_b32 v0, v0 @@ -567,8 +609,8 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) { ; SDAG-NEXT: ; %bb.2: ; %use.bb ; SDAG-NEXT: global_load_dword v0, v[1:2], off glc ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: .LBB4_3: ; %ret ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB4_3: ; %ret ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: s_setpc_b64 s[30:31] ; SDAG-NEXT: .LBB4_4: @@ -582,8 +624,10 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) { ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_and_b32_e32 v3, 1, v3 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GISEL-NEXT: s_cbranch_execz .LBB4_3 +; GISEL-NEXT: s_mov_b64 s[4:5], exec +; GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB4_3 ; GISEL-NEXT: ; %bb.1: ; %use.bb ; GISEL-NEXT: s_cbranch_execnz .LBB4_4 ; GISEL-NEXT: ; %bb.2: ; %use.bb @@ -591,8 +635,8 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) { ; GISEL-NEXT: ds_write_b32 v0, v0 ; GISEL-NEXT: global_load_dword v0, v[1:2], off glc ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: .LBB4_3: ; %ret ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB4_3: ; %ret ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] ; GISEL-NEXT: .LBB4_4: diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.f16.ll index d23dee1f02f09c..7f09c7c242a578 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.f16.ll @@ -44,11 +44,10 @@ define half 
@raw_buffer_load_format_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(< ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call half @llvm.amdgcn.raw.buffer.load.format.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -97,11 +96,10 @@ define <2 x half> @raw_buffer_load_format_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_s ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_D16_XY_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_XY_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY 
[[BUFFER_LOAD_FORMAT_D16_XY_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call <2 x half> @llvm.amdgcn.raw.buffer.load.format.v2f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -150,11 +148,10 @@ define <4 x half> @raw_buffer_load_format_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_s ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub0 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub1 ; GFX908-NEXT: $vgpr0 = COPY [[COPY6]] @@ -207,11 +204,10 @@ define half @raw_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffset(< ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, 
[[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call half @llvm.amdgcn.raw.buffer.load.format.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -260,11 +256,10 @@ define <4 x half> @raw_buffer_load_format_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_s ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub0 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub1 ; GFX908-NEXT: $vgpr0 = COPY [[COPY6]] diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.ll index bdcb77201714ab..bd80101168e6b9 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.ll @@ -44,11 +44,10 @@ define float 
@raw_buffer_load_format_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset( ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -97,11 +96,10 @@ define <2 x float> @raw_buffer_load_format_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_ ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_XY_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_XY_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY 
[[BUFFER_LOAD_FORMAT_XY_OFFEN]].sub0 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XY_OFFEN]].sub1 ; GFX908-NEXT: $vgpr0 = COPY [[COPY6]] @@ -153,11 +151,10 @@ define <3 x float> @raw_buffer_load_format_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_ ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_XYZ_OFFEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_FORMAT_XYZ_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub0 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub2 @@ -211,11 +208,10 @@ define <4 x float> @raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_ ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], 
implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub0 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub2 @@ -272,11 +268,10 @@ define float @raw_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset__vgpr_soffset( ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -325,11 +320,10 @@ define <4 x float> @raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_ ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 
8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub0 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub2 diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.ll index 28059db0bede3a..73c68e1185e8ed 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.ll @@ -44,11 +44,10 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i3 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN 
implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -98,11 +97,10 @@ define float @raw_buffer_load_f32__sgpr_rsrc__sgpr_voffset__sgpr_soffset(<4 x i3 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -152,11 +150,10 @@ define float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i3 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: 
{{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -206,11 +203,10 @@ define float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffset(<4 x i3 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -260,11 +256,10 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(<4 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 1, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = 
S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 1) @@ -314,11 +309,10 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(<4 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 2, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2) @@ -368,11 +362,10 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(<4 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 4, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, 
[[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 4) @@ -422,11 +415,10 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_dlc ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 6, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 6) @@ -476,11 +468,10 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc_dlc ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed 
[[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 5, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 5) @@ -530,11 +521,10 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc_slc ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 7, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 7) @@ -584,11 +574,10 @@ define <2 x float> @raw_buffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; GFX908-NEXT: successors: 
%bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub0 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub1 ; GFX908-NEXT: $vgpr0 = COPY [[COPY6]] @@ -640,11 +629,10 @@ define <3 x float> @raw_buffer_load_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORDX3_OFFEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub0 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY 
[[BUFFER_LOAD_DWORDX3_OFFEN]].sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub2 @@ -698,11 +686,10 @@ define <4 x float> @raw_buffer_load_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2 @@ -758,11 +745,10 @@ define half @raw_buffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP 
[[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_USHORT_OFFEN]] ; GFX908-NEXT: $vgpr0 = COPY [[COPY6]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 @@ -812,11 +798,10 @@ define <2 x half> @raw_buffer_load_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset( ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -865,11 +850,10 @@ define <4 x half> @raw_buffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset( ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: 
[[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub0 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub1 ; GFX908-NEXT: $vgpr0 = COPY [[COPY6]] @@ -921,11 +905,10 @@ define float @raw_buffer_load_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset_zext(<4 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -976,11 +959,10 @@ define float @raw_buffer_load_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sext(<4 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_SBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SBYTE_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8) - ; GFX908-NEXT: $exec 
= S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_SBYTE_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -1031,11 +1013,10 @@ define float @raw_buffer_load_i16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_zext(<4 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -1086,11 +1067,10 @@ define float @raw_buffer_load_i16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sext(<4 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_SSHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SSHORT_OFFEN 
[[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_SSHORT_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -1142,11 +1122,10 @@ define half @raw_buffer_load_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_USHORT_OFFEN]] ; GFX908-NEXT: $vgpr0 = COPY [[COPY6]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 @@ -1197,11 +1176,10 @@ define float @raw_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32 ; GFX908-NEXT: successors: 
%bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -1251,11 +1229,10 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vdpr_voffset__sgpr_soffset__voffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %soffset, 
i32 0) @@ -1303,11 +1280,10 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) @@ -1356,11 +1332,10 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_MOV_B32_e32_]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = 
COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4096, i32 %soffset, i32 0) @@ -1409,11 +1384,10 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 16, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %voffset = add i32 %voffset.base, 16 @@ -1463,11 +1437,10 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: 
{{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %voffset = add i32 %voffset.base, 4095 @@ -1519,11 +1492,10 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %voffset = add i32 %voffset.base, 4096 @@ -1569,11 +1541,10 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], killed [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, 
implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0) @@ -1618,11 +1589,10 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], killed [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0) @@ -1673,11 +1643,10 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = 
S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %soffset = add i32 %soffset.base, 16 @@ -1729,11 +1698,10 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %soffset = add i32 %soffset.base, 4095 @@ -1785,11 +1753,10 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: 
[[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %soffset = add i32 %soffset.base, 4096 @@ -1842,11 +1809,10 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %soffset = add i32 %soffset.base, 5000 @@ -1899,11 +1865,10 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 904, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: 
SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %voffset = add i32 %voffset.base, 5000 diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f16.ll index ed5fa05fa8ed30..92692802437472 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f16.ll @@ -44,11 +44,10 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.format.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -96,11 +95,10 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_4095__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), 
%bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_X_OFFSET_exact [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.format.f16(half %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) ret void @@ -149,11 +147,10 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -207,11 +204,10 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: successors: 
%bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -266,11 +262,10 @@ define void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -315,11 +310,10 @@ define void 
@raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY1]], [[COPY]], killed [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0) ret void @@ -364,11 +358,10 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY1]], [[COPY]], killed [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0) ret void @@ -417,11 +410,10 @@ define void 
@raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 16, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -471,11 +463,10 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x 
half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -527,11 +518,10 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY2]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -590,11 +580,10 @@ define void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[COPY8]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] 
; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f32.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f32.ll index e38de72e1f0f10..8b3a3670f4433e 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f32.ll @@ -44,11 +44,10 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.format.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -96,11 +95,10 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_4095__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, 
[[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.format.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) ret void @@ -154,11 +152,10 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -214,11 +211,10 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[COPY9]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 8) - ; GFX908-NEXT: $exec = 
S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.format.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -276,11 +272,10 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[COPY10]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -338,11 +333,10 @@ define void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[COPY10]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, 
addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -392,11 +386,10 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY7]], [[COPY]], killed [[REG_SEQUENCE4]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0) ret void @@ -446,11 +439,10 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY7]], [[COPY]], killed [[REG_SEQUENCE4]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) 
- ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0) ret void @@ -504,11 +496,10 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 16, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -563,11 +554,10 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit 
$exec :: (dereferenceable store (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -624,11 +614,10 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY8]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -691,11 +680,10 @@ define void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: 
BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[COPY10]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.ll index 39c58f8f39d597..53c870cb807c92 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.ll @@ -45,11 +45,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset(< ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = 
S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -99,11 +98,10 @@ define void @raw_buffer_store__sgpr_rsrc__sgpr_val__sgpr_voffset__sgpr_soffset(< ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -153,11 +151,10 @@ define void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset(< ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 
[[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -207,11 +204,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffset(< ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -261,11 +257,10 @@ define void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffset(< ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 
[[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -314,11 +309,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_g ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 1, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 1) ret void @@ -367,11 +361,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_s ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 2, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 
[[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2) ret void @@ -420,11 +413,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_g ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 3, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 3) ret void @@ -473,11 +465,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_d ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 4, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 
[[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 4) ret void @@ -526,11 +517,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_s ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 6, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 6) ret void @@ -579,11 +569,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_g ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 5, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 
[[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 5) ret void @@ -632,11 +621,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_g ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 7, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 7) ret void @@ -690,11 +678,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 
[[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -750,11 +737,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORDX3_OFFEN_exact [[COPY9]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -812,11 +798,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact [[COPY10]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec 
= S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -865,11 +850,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_i ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_BYTE_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %val.trunc = trunc i32 %val to i8 call void @llvm.amdgcn.raw.buffer.store.i8(i8 %val.trunc, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -919,11 +903,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_i ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_SHORT_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; 
GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %val.trunc = trunc i32 %val to i16 call void @llvm.amdgcn.raw.buffer.store.i16(i16 %val.trunc, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -974,11 +957,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_SHORT_OFFEN_exact [[COPY7]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -1027,11 +1009,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; 
GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -1085,11 +1066,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -1143,11 +1123,10 @@ define void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; 
GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -1195,11 +1174,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_voffset4095 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET_exact [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) ret void @@ -1248,11 +1226,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_voffset4096 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY1]], [[V_MOV_B32_e32_]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; 
GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 4096, i32 %soffset, i32 0) ret void @@ -1301,11 +1278,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 16, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -1355,11 +1331,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], 
[[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -1411,11 +1386,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -1461,11 +1435,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY1]], [[COPY]], killed [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; 
GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0) ret void @@ -1510,11 +1483,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY1]], [[COPY]], killed [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0) ret void @@ -1563,11 +1535,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 16, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: 
SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -1617,11 +1588,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -1673,11 +1643,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec 
= S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -1730,11 +1699,10 @@ define void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_o ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 904, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 5000 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -1785,11 +1753,10 @@ define void @raw_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr_soffset_o ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY1]], [[V_MOV_B32_e32_]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 904, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: 
SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 5000, i32 %soffset, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.f16.ll index 5b8bd1f60233b8..e79b946658cef8 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.f16.ll @@ -54,11 +54,10 @@ define half @raw_ptr_buffer_load_format_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call half @llvm.amdgcn.raw.ptr.buffer.load.format.f16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ 
-117,11 +116,10 @@ define <2 x half> @raw_ptr_buffer_load_format_v2f16__sgpr_rsrc__vgpr_voffset__sg ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_D16_XY_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_XY_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_XY_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.load.format.v2f16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -180,11 +178,10 @@ define <4 x half> @raw_ptr_buffer_load_format_v4f16__sgpr_rsrc__vgpr_voffset__sg ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s64) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: 
bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub0 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub1 ; GFX908-NEXT: $vgpr0 = COPY [[COPY10]] @@ -247,11 +244,10 @@ define half @raw_ptr_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call half @llvm.amdgcn.raw.ptr.buffer.load.format.f16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -310,11 +306,10 @@ define <4 x half> @raw_ptr_buffer_load_format_v4f16__sgpr_rsrc__vgpr_voffset__sg ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable load (s64) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP 
%bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub0 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub1 ; GFX908-NEXT: $vgpr0 = COPY [[COPY10]] diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.ll index 7dabd9a3957468..61049b2ff9af5d 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.ll @@ -54,11 +54,10 @@ define float @raw_ptr_buffer_load_format_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.buffer.load.format.f32(ptr addrspace(8) %rsrc, i32 
%voffset, i32 %soffset, i32 0) @@ -117,11 +116,10 @@ define <2 x float> @raw_ptr_buffer_load_format_v2f32__sgpr_rsrc__vgpr_voffset__s ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_XY_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_XY_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s64) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XY_OFFEN]].sub0 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XY_OFFEN]].sub1 ; GFX908-NEXT: $vgpr0 = COPY [[COPY10]] @@ -183,11 +181,10 @@ define <3 x float> @raw_ptr_buffer_load_format_v3f32__sgpr_rsrc__vgpr_voffset__s ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_XYZ_OFFEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_FORMAT_XYZ_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s96) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: 
- ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub0 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub1 ; GFX908-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub2 @@ -251,11 +248,10 @@ define <4 x float> @raw_ptr_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__s ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub0 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub1 ; GFX908-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub2 @@ -322,11 +318,10 @@ define float @raw_ptr_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], 
implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.buffer.load.format.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -385,11 +380,10 @@ define <4 x float> @raw_ptr_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__s ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub0 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub1 ; GFX908-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub2 diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.ll index 1a9f7b1619f4ce..14df8287ecf55e 100644 --- 
a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.ll @@ -54,11 +54,10 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -118,11 +117,10 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__sgpr_voffset__sgpr_soffset(ptr ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP 
[[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -182,11 +180,10 @@ define float @raw_ptr_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -246,11 +243,10 @@ define float @raw_ptr_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffset(ptr ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; 
GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -310,11 +306,10 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 1, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 1) @@ -374,11 +369,10 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed 
[[V_READFIRSTLANE_B32_4]], 0, 2, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 2) @@ -438,11 +432,10 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 4, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 4) @@ -502,11 +495,10 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc ; 
GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 6, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 6) @@ -566,11 +558,10 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 5, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = 
call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 5) @@ -630,11 +621,10 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 7, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 7) @@ -694,11 +684,10 @@ define <2 x float> @raw_ptr_buffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_sof ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s64) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], 
%bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub0 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub1 ; GFX908-NEXT: $vgpr0 = COPY [[COPY10]] @@ -760,11 +749,10 @@ define <3 x float> @raw_ptr_buffer_load_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_sof ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORDX3_OFFEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s96) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub0 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub1 ; GFX908-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub2 @@ -828,11 +816,10 @@ define <4 x float> @raw_ptr_buffer_load_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_sof ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def 
$scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1 ; GFX908-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2 @@ -898,11 +885,10 @@ define half @raw_ptr_buffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_USHORT_OFFEN]] ; GFX908-NEXT: $vgpr0 = COPY [[COPY10]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 @@ -962,11 +948,10 @@ define <2 x half> @raw_ptr_buffer_load_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], 
killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.load.v2f16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -1025,11 +1010,10 @@ define <4 x half> @raw_ptr_buffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s64) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub0 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub1 ; GFX908-NEXT: $vgpr0 = COPY [[COPY10]] @@ -1091,11 +1075,10 @@ define float 
@raw_ptr_buffer_load_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset_zext ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8) from %ir.rsrc, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -1156,11 +1139,10 @@ define float @raw_ptr_buffer_load_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sext ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_SBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SBYTE_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8) from %ir.rsrc, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_SBYTE_OFFEN]] 
; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -1221,11 +1203,10 @@ define float @raw_ptr_buffer_load_i16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_zex ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -1286,11 +1267,10 @@ define float @raw_ptr_buffer_load_i16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sex ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_SSHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SSHORT_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: 
SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_SSHORT_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -1352,11 +1332,10 @@ define half @raw_ptr_buffer_load_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_USHORT_OFFEN]] ; GFX908-NEXT: $vgpr0 = COPY [[COPY10]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 @@ -1417,11 +1396,10 @@ define float @raw_ptr_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8) from %ir.rsrc, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP 
%bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -1481,11 +1459,10 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vdpr_voffset__sgpr_soffset__vo ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0) @@ -1543,11 +1520,10 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__vo ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec 
:: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 4095, i32 %soffset, i32 0) @@ -1606,11 +1582,10 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__vo ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_MOV_B32_e32_]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 4096, i32 %soffset, i32 0) @@ -1669,11 +1644,10 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_vof ; GFX908-NEXT: successors: %bb.1(0x40000000), 
%bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 16, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %voffset = add i32 %voffset.base, 16 @@ -1733,11 +1707,10 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__vo ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %voffset = add i32 %voffset.base, 4095 @@ -1799,11 +1772,10 @@ define float 
@raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__vo ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %voffset = add i32 %voffset.base, 4096 @@ -1859,11 +1831,10 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sof ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], killed [[REG_SEQUENCE5]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float 
@llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 4095, i32 0) @@ -1918,11 +1889,10 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sof ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], killed [[REG_SEQUENCE5]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 4096, i32 0) @@ -1983,11 +1953,10 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sof ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ 
$}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %soffset = add i32 %soffset.base, 16 @@ -2049,11 +2018,10 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sof ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %soffset = add i32 %soffset.base, 4095 @@ -2115,11 +2083,10 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sof ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP 
[[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %soffset = add i32 %soffset.base, 4096 @@ -2182,11 +2149,10 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sof ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %soffset = add i32 %soffset.base, 5000 @@ -2249,11 +2215,10 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_vof ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 904, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, 
[[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %voffset = add i32 %voffset.base, 5000 diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f16.ll index eada2004161d1e..68a2b3c7c586e6 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f16.ll @@ -54,11 +54,10 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.format.f16(half %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -116,11 +115,10 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_4095__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: 
BUFFER_STORE_FORMAT_D16_X_OFFSET_exact [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.format.f16(half %val, ptr addrspace(8) %rsrc, i32 4095, i32 %soffset, i32 0) ret void @@ -179,11 +177,10 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.format.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -247,11 +244,10 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: successors: 
%bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.format.v4f16(<4 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -316,11 +312,10 @@ define void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.format.v4f16(<4 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -375,11 +370,10 @@ define void 
@raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY1]], [[COPY]], killed [[REG_SEQUENCE5]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.format.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 4095, i32 0) ret void @@ -434,11 +428,10 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY1]], [[COPY]], killed [[REG_SEQUENCE5]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.format.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 4096, i32 0) ret 
void @@ -497,11 +490,10 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 16, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.ptr.buffer.store.format.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -561,11 +553,10 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = 
add i32 %voffset, 4095 call void @llvm.amdgcn.raw.ptr.buffer.store.format.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -627,11 +618,10 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY2]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.ptr.buffer.store.format.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -700,11 +690,10 @@ define void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[COPY12]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP 
[[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.ptr.buffer.store.format.v4f16(<4 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f32.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f32.ll index 60db62dc43a619..bc43b040a386fa 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f32.ll @@ -54,11 +54,10 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.format.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -116,11 +115,10 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_4095__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_X_OFFSET_exact 
[[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.format.f32(float %val, ptr addrspace(8) %rsrc, i32 4095, i32 %soffset, i32 0) ret void @@ -184,11 +182,10 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.format.v2f32(<2 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -254,11 +251,10 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; 
GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[COPY13]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s96) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.format.v3f32(<3 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -326,11 +322,10 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[COPY14]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.format.v4f32(<4 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -398,11 +393,10 @@ define void 
@raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[COPY14]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.format.v4f32(<4 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -462,11 +456,10 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY11]], [[COPY]], killed [[REG_SEQUENCE6]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.format.v2f32(<2 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset, 
i32 4095, i32 0) ret void @@ -526,11 +519,10 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY11]], [[COPY]], killed [[REG_SEQUENCE6]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.format.v2f32(<2 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 4096, i32 0) ret void @@ -594,11 +586,10 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 16, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 16 call void 
@llvm.amdgcn.raw.ptr.buffer.store.format.v2f32(<2 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -663,11 +654,10 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.ptr.buffer.store.format.v2f32(<2 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -734,11 +724,10 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY12]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; 
GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.ptr.buffer.store.format.v2f32(<2 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -811,11 +800,10 @@ define void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[COPY14]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.ptr.buffer.store.format.v4f32(<4 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.ll index 78e29387b1d40c..71175455373519 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.ll @@ -55,11 +55,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed 
[[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -119,11 +118,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__sgpr_val__sgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -183,11 +181,10 @@ define void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], 
[[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -247,11 +244,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -311,11 +307,10 @@ define void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: 
BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -374,11 +369,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 1, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 1) ret void @@ -437,11 +431,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), 
%bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 2, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 2) ret void @@ -500,11 +493,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 3, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 3) ret void @@ -563,11 +555,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; 
GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 4, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 4) ret void @@ -626,11 +617,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 6, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 6) ret void @@ -689,11 +679,10 @@ define void 
@raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 5, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 5) ret void @@ -752,11 +741,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 7, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 7) 
ret void @@ -820,11 +808,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.v2f32(<2 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -890,11 +877,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORDX3_OFFEN_exact [[COPY13]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s96) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.v3f32(<3 x float> 
%val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -962,11 +948,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact [[COPY14]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -1025,11 +1010,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_BYTE_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s8) into %ir.rsrc, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %val.trunc 
= trunc i32 %val to i8 call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 %val.trunc, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -1089,11 +1073,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_SHORT_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %val.trunc = trunc i32 %val to i16 call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 %val.trunc, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -1154,11 +1137,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_SHORT_OFFEN_exact [[COPY11]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; 
GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.f16(half %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -1217,11 +1199,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -1285,11 +1266,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], 
[[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.v4f16(<4 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -1353,11 +1333,10 @@ define void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.v4f16(<4 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -1415,11 +1394,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_voffset ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET_exact [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc 
+ ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 4095, i32 %soffset, i32 0) ret void @@ -1478,11 +1456,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_voffset ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY1]], [[V_MOV_B32_e32_]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 4096, i32 %soffset, i32 0) ret void @@ -1541,11 +1518,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 16, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term 
$exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -1605,11 +1581,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -1671,11 +1646,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; 
GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -1731,11 +1705,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY1]], [[COPY]], killed [[REG_SEQUENCE5]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 4095, i32 0) ret void @@ -1790,11 +1763,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY1]], [[COPY]], killed [[REG_SEQUENCE5]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, 
[[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 4096, i32 0) ret void @@ -1853,11 +1825,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 16, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.ptr.buffer.store.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -1917,11 +1888,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable store 
(s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.ptr.buffer.store.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -1983,11 +1953,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.ptr.buffer.store.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -2050,11 +2019,10 @@ define void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: 
BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 904, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 5000 call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -2115,11 +2083,10 @@ define void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY1]], [[V_MOV_B32_e32_]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 904, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 5000, i32 %soffset, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.f16.ll 
b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.f16.ll index 24dc4f1b3c0aa9..e3620b93ce1da6 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.f16.ll @@ -53,11 +53,10 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr add ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call half @llvm.amdgcn.raw.ptr.tbuffer.load.f16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) @@ -116,11 +115,10 @@ define <2 x half> @raw_tbuffer_load_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XY_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_XY_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, 
implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call <2 x half> @llvm.amdgcn.raw.ptr.tbuffer.load.v2f16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) @@ -179,11 +177,10 @@ define <4 x half> @raw_tbuffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XYZW_OFFEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s64) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub0 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub1 ; GFX908-NEXT: $vgpr0 = COPY [[COPY10]] @@ -245,11 +242,10 @@ define half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffset(ptr add ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN 
[[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call half @llvm.amdgcn.raw.ptr.tbuffer.load.f16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) @@ -308,11 +304,10 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(ptr ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 1, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call half @llvm.amdgcn.raw.ptr.tbuffer.load.f16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 1) @@ 
-371,11 +366,10 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(ptr ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 2, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call half @llvm.amdgcn.raw.ptr.tbuffer.load.f16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 2) @@ -434,11 +428,10 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_glc ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 3, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: 
bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call half @llvm.amdgcn.raw.ptr.tbuffer.load.f16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 3) @@ -497,11 +490,10 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(ptr ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 4, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call half @llvm.amdgcn.raw.ptr.tbuffer.load.f16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 4) diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.ll index 01dc0328f2d2d9..6475c5266cd7cf 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.ll @@ -53,11 +53,10 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr ad ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: 
[[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.tbuffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) @@ -116,11 +115,10 @@ define <2 x float> @raw_tbuffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s64) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFEN]].sub0 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY 
[[TBUFFER_LOAD_FORMAT_XY_OFFEN]].sub1 ; GFX908-NEXT: $vgpr0 = COPY [[COPY10]] @@ -182,11 +180,10 @@ define <3 x float> @raw_tbuffer_load_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s96) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub0 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub1 ; GFX908-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub2 @@ -250,11 +247,10 @@ define <4 x float> @raw_tbuffer_load_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: 
SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub0 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub1 ; GFX908-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub2 @@ -320,11 +316,10 @@ define float @raw_tbuffer_load_f32__vgpr_rsrc__sgpr_voffset__vgpr_soffset(ptr ad ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.tbuffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) @@ -383,11 +378,10 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(pt ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 1, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 
1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.tbuffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 1) @@ -446,11 +440,10 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(pt ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 2, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.tbuffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 2) @@ -509,11 +502,10 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_gl ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; 
GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 3, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.tbuffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 3) @@ -572,11 +564,10 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(pt ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 4, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float 
@llvm.amdgcn.raw.ptr.tbuffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 4) diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.f16.ll index cd60d5b21faba0..dd74631415de0c 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.f16.ll @@ -54,11 +54,10 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(half % ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f16(half %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -117,11 +116,10 @@ define void @raw_tbuffer_store_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], 
implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -185,11 +183,10 @@ define void @raw_tbuffer_store_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.v4f16(<4 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -249,11 +246,10 @@ define void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffset(half % ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s16) into 
%ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f16(half %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) ret void @@ -313,11 +309,10 @@ define void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soffset(half % ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f16(half %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -377,11 +372,10 @@ define void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffset(half % ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed 
[[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f16(half %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -440,11 +434,10 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(ha ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 1, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f16(half %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 1) ret void @@ -503,11 +496,10 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(ha ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: 
TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 2, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f16(half %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 2) ret void @@ -566,11 +558,10 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_gl ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 3, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f16(half %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 3) ret void @@ -629,11 +620,10 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(ha ; GFX908-NEXT: 
successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 4, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f16(half %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 4) ret void diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.ll index 5c67d82c1e977e..3b06dd3b669061 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.ll @@ -55,11 +55,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], 
[[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -124,11 +123,10 @@ define void @raw_tbuffer_store_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.v2f32(<2 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -195,11 +193,10 @@ define void @raw_tbuffer_store_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<3 x ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[COPY13]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s96) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, 
[[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.v3f32(<3 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -268,11 +265,10 @@ define void @raw_tbuffer_store_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[COPY14]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.v4f32(<4 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -332,11 +328,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__sgpr_voffset__sgpr_soffset(float ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, 
implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) ret void @@ -396,11 +391,10 @@ define void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffset(float ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 1, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 1) ret void @@ -460,11 +454,10 @@ define void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffset(float ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, 
[[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) ret void @@ -524,11 +517,10 @@ define void @raw_tbuffer_store_f32__vgpr_rsrc__sgpr_voffset__vgpr_soffset(float ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -588,11 +580,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(fl ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 1, 0, implicit $exec :: (dereferenceable store (s32) into 
%ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 1) ret void @@ -652,11 +643,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(fl ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 2, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 2) ret void @@ -716,11 +706,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_gl ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed 
[[V_READFIRSTLANE_B32_4]], 0, 78, 3, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 3) ret void @@ -780,11 +769,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(fl ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 4, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 4) ret void @@ -842,11 +830,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vdpr_voffset__sgpr_soffset__voffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: 
TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 94, i32 0) ret void @@ -904,11 +891,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 4095, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 4095, i32 %soffset, i32 94, i32 0) ret void @@ -967,11 +953,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffs ; GFX908-NEXT: successors: %bb.1(0x40000000), 
%bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY5]], [[V_MOV_B32_e32_]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 4096, i32 %soffset, i32 94, i32 0) ret void @@ -1030,11 +1015,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 16, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset = add i32 %voffset.base, 16 call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -1094,11 +1078,10 @@ define void 
@raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 4095, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset = add i32 %voffset.base, 4095 call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -1160,11 +1143,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset = add i32 %voffset.base, 4096 call void 
@llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -1220,11 +1202,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY5]], [[COPY]], killed [[REG_SEQUENCE5]], [[S_MOV_B32_]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 4095, i32 94, i32 0) ret void @@ -1279,11 +1260,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY5]], [[COPY]], killed [[REG_SEQUENCE5]], [[S_MOV_B32_]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; 
GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 4096, i32 94, i32 0) ret void @@ -1344,11 +1324,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %soffset = add i32 %soffset.base, 16 call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -1410,11 +1389,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, 
implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %soffset = add i32 %soffset.base, 4095 call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -1476,11 +1454,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %soffset = add i32 %soffset.base, 4096 call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -1543,11 +1520,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = 
S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %soffset = add i32 %soffset.base, 5000 call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -1610,11 +1586,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 904, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset = add i32 %voffset.base, 5000 call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.f16.ll index bcffca8a3c4fca..8b93667fcb1b8b 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.f16.ll @@ -43,11 +43,10 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i3 ; GFX908-NEXT: successors: %bb.1(0x40000000), 
%bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) @@ -96,11 +95,10 @@ define <2 x half> @raw_tbuffer_load_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XY_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_XY_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call <2 x half> 
@llvm.amdgcn.raw.tbuffer.load.v2f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) @@ -149,11 +147,10 @@ define <4 x half> @raw_tbuffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XYZW_OFFEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub0 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub1 ; GFX908-NEXT: $vgpr0 = COPY [[COPY6]] @@ -205,11 +202,10 @@ define half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffset(<4 x i3 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], 
[[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) @@ -258,11 +254,10 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(<4 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 1, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 1) @@ -311,11 +306,10 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(<4 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 2, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; 
GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 2) @@ -364,11 +358,10 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_glc ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 3, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 3) @@ -417,11 +410,10 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(<4 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY1]], 
killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 4, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 4) diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.ll index 51e56a47fc2f25..43ae7666b73f25 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.ll @@ -43,11 +43,10 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = 
S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) @@ -96,11 +95,10 @@ define <2 x float> @raw_tbuffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFEN]].sub0 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFEN]].sub1 ; GFX908-NEXT: $vgpr0 = COPY [[COPY6]] @@ -152,11 +150,10 @@ define <3 x float> @raw_tbuffer_load_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = 
S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub0 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub2 @@ -210,11 +207,10 @@ define <4 x float> @raw_tbuffer_load_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub0 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub2 @@ -270,11 +266,10 @@ define float @raw_tbuffer_load_f32__vgpr_rsrc__sgpr_voffset__vgpr_soffset(<4 x i ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], 
killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) @@ -323,11 +318,10 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(<4 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 1, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 1) @@ -376,11 +370,10 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(<4 ; 
GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 2, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 2) @@ -429,11 +422,10 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_gl ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 3, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val 
= call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 3) @@ -482,11 +474,10 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(<4 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 4, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 4) diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.f16.ll index a1d8acdb4cc531..0c5f5771912c16 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.f16.ll @@ -44,11 +44,10 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(half % ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term 
$exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -97,11 +96,10 @@ define void @raw_tbuffer_store_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -155,11 +153,10 @@ define void @raw_tbuffer_store_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) 
- ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -209,11 +206,10 @@ define void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffset(half % ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) ret void @@ -263,11 +259,10 @@ define void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soffset(half % ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: 
(dereferenceable store (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -317,11 +312,10 @@ define void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffset(half % ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -370,11 +364,10 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(ha ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 
0, 78, 1, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 1) ret void @@ -423,11 +416,10 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(ha ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 2, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 2) ret void @@ -476,11 +468,10 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_gl ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], 
killed [[V_READFIRSTLANE_B32_4]], 0, 78, 3, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 3) ret void @@ -529,11 +520,10 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(ha ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 4, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 4) ret void diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.ll index 7c0aa26a8a6990..91f546cd2eb042 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.ll +++ 
b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.ll @@ -45,11 +45,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -104,11 +103,10 @@ define void @raw_tbuffer_store_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void 
@llvm.amdgcn.raw.tbuffer.store.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -165,11 +163,10 @@ define void @raw_tbuffer_store_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<3 x ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[COPY9]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -228,11 +225,10 @@ define void @raw_tbuffer_store_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[COPY10]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 
[[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -282,11 +278,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__sgpr_voffset__sgpr_soffset(float ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) ret void @@ -336,11 +331,10 @@ define void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffset(float ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 1, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; 
GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 1) ret void @@ -390,11 +384,10 @@ define void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffset(float ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) ret void @@ -444,11 +437,10 @@ define void @raw_tbuffer_store_f32__vgpr_rsrc__sgpr_voffset__vgpr_soffset(float ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; 
GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -498,11 +490,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(fl ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 1, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 1) ret void @@ -552,11 +543,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(fl ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 2, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc 
; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 2) ret void @@ -606,11 +596,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_gl ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 3, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 3) ret void @@ -660,11 +649,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(fl ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 4, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], 
[[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 4) ret void @@ -712,11 +700,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vdpr_voffset__sgpr_soffset__voffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 94, i32 0) ret void @@ -764,11 +751,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 4095, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], 
[[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 94, i32 0) ret void @@ -817,11 +803,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY5]], [[V_MOV_B32_e32_]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 4096, i32 %soffset, i32 94, i32 0) ret void @@ -870,11 +855,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 16, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: 
SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset = add i32 %voffset.base, 16 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -924,11 +908,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 4095, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset = add i32 %voffset.base, 4095 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -980,11 +963,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: 
[[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset = add i32 %voffset.base, 4096 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -1030,11 +1012,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY5]], [[COPY]], killed [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 94, i32 0) ret void @@ -1079,11 +1060,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY5]], [[COPY]], killed [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec 
+ ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 94, i32 0) ret void @@ -1134,11 +1114,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %soffset = add i32 %soffset.base, 16 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -1190,11 +1169,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; 
GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %soffset = add i32 %soffset.base, 4095 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -1246,11 +1224,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %soffset = add i32 %soffset.base, 4096 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -1303,11 +1280,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; 
GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %soffset = add i32 %soffset.base, 5000 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -1360,11 +1336,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 904, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset = add i32 %voffset.base, 5000 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/legalize-soffset-mbuf.ll b/llvm/test/CodeGen/AMDGPU/legalize-soffset-mbuf.ll index bfd97c53522c9a..f2bd9f82f69430 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-soffset-mbuf.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-soffset-mbuf.ll @@ -26,11 +26,10 @@ define float 
@llvm_amdgcn_raw_buffer_load_f32(i32 %voffset, i32 %soffset) { ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> poison, i32 %voffset, i32 %soffset, i32 0) @@ -60,11 +59,10 @@ define float @llvm_amdgcn_raw_tbuffer_load_f32(i32 %voffset, i32 %soffset) { ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = 
call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> poison, i32 %voffset, i32 %soffset, i32 0, i32 0) @@ -94,11 +92,10 @@ define <2 x float> @llvm_amdgcn_raw_buffer_load_v2f32(i32 %voffset, i32 %soffset ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub0 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub1 ; GFX908-NEXT: $vgpr0 = COPY [[COPY2]] @@ -131,11 +128,10 @@ define <2 x float> @llvm_amdgcn_raw_tbuffer_load_v2f32(i32 %voffset, i32 %soffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFEN [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: 
- ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFEN]].sub0 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFEN]].sub1 ; GFX908-NEXT: $vgpr0 = COPY [[COPY2]] @@ -168,11 +164,10 @@ define <3 x float> @llvm_amdgcn_raw_buffer_load_v3f32(i32 %voffset, i32 %soffset ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORDX3_OFFEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_OFFEN [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub0 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub1 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub2 @@ -207,11 +202,10 @@ define <3 x float> @llvm_amdgcn_raw_tbuffer_load_v3f32(i32 %voffset, i32 %soffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFEN [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: 
[[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub0 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub1 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub2 @@ -246,11 +240,10 @@ define <4 x float> @llvm_amdgcn_raw_buffer_load_v4f32(i32 %voffset, i32 %soffset ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2 @@ -287,11 +280,10 @@ define <4 x float> @llvm_amdgcn_raw_tbuffer_load_v4f32(i32 %voffset, i32 %soffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY1]], [[DEF]], killed 
[[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub0 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub1 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub2 @@ -329,11 +321,10 @@ define void @llvm_amdgcn_raw_buffer_store_f32(float %val, i32 %voffset, i32 %sof ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> poison, i32 %voffset, i32 %soffset, i32 0) ret void @@ -363,11 +354,10 @@ define void @llvm_amdgcn_raw_tbuffer_store_f32(float %val, i32 %voffset, i32 %so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; 
GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY2]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> poison, i32 %voffset, i32 %soffset, i32 0, i32 0) ret void @@ -402,11 +392,10 @@ define void @llvm_amdgcn_raw_buffer_store_v2f32(<2 x float> %val, i32 %voffset, ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[COPY4]], [[COPY1]], [[DEF2]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %val, <4 x i32> poison, i32 %voffset, i32 %soffset, i32 0) ret void @@ -441,11 +430,10 @@ define void @llvm_amdgcn_raw_tbuffer_store_v2f32(<2 x float> %val, i32 %voffset, ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: 
TBUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY4]], [[COPY1]], [[DEF2]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.v2f32(<2 x float> %val, <4 x i32> poison, i32 %voffset, i32 %soffset, i32 0, i32 0) ret void @@ -482,11 +470,10 @@ define void @llvm_amdgcn_raw_buffer_store_v3f32(<3 x float> %val, i32 %voffset, ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORDX3_OFFEN_exact [[COPY5]], [[COPY1]], [[DEF3]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.v3f32(<3 x float> %val, <4 x i32> poison, i32 %voffset, i32 %soffset, i32 0) ret void @@ -523,11 +510,10 @@ define void @llvm_amdgcn_raw_tbuffer_store_v3f32(<3 x float> %val, i32 %voffset, ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: 
TBUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[COPY5]], [[COPY1]], [[DEF3]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.v3f32(<3 x float> %val, <4 x i32> poison, i32 %voffset, i32 %soffset, i32 0, i32 0) ret void @@ -566,11 +552,10 @@ define void @llvm_amdgcn_raw_buffer_store_v4f32(<4 x float> %val, i32 %voffset, ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact [[COPY6]], [[COPY1]], [[DEF4]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %val, <4 x i32> poison, i32 %voffset, i32 %soffset, i32 0) ret void @@ -609,11 +594,10 @@ define void @llvm_amdgcn_raw_tbuffer_store_v4f32(<4 x float> %val, i32 %voffset, ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: 
TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[COPY6]], [[COPY1]], [[DEF4]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float> %val, <4 x i32> poison, i32 %voffset, i32 %soffset, i32 0, i32 0) ret void @@ -644,11 +628,10 @@ define float @llvm_amdgcn_raw_ptr_buffer_load_f32(i32 %voffset, i32 %soffset) { ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from `ptr addrspace(8) poison`, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) poison, i32 %voffset, i32 %soffset, i32 0) @@ -678,11 +661,10 @@ define float @llvm_amdgcn_raw_ptr_tbuffer_load_f32(i32 %voffset, i32 
%soffset) { ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from `ptr addrspace(8) poison`, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.tbuffer.load.f32(ptr addrspace(8) poison, i32 %voffset, i32 %soffset, i32 0, i32 0) @@ -712,11 +694,10 @@ define <2 x float> @llvm_amdgcn_raw_ptr_buffer_load_v2f32(i32 %voffset, i32 %sof ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s64) from `ptr addrspace(8) poison`, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY 
[[BUFFER_LOAD_DWORDX2_OFFEN]].sub0 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub1 ; GFX908-NEXT: $vgpr0 = COPY [[COPY2]] @@ -749,11 +730,10 @@ define <2 x float> @llvm_amdgcn_raw_ptr_tbuffer_load_v2f32(i32 %voffset, i32 %so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFEN [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64) from `ptr addrspace(8) poison`, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFEN]].sub0 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFEN]].sub1 ; GFX908-NEXT: $vgpr0 = COPY [[COPY2]] @@ -786,11 +766,10 @@ define <3 x float> @llvm_amdgcn_raw_ptr_buffer_load_v3f32(i32 %voffset, i32 %sof ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORDX3_OFFEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_OFFEN [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s96) from `ptr addrspace(8) poison`, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; 
GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub0 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub1 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub2 @@ -825,11 +804,10 @@ define <3 x float> @llvm_amdgcn_raw_ptr_tbuffer_load_v3f32(i32 %voffset, i32 %so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFEN [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96) from `ptr addrspace(8) poison`, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub0 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub1 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub2 @@ -864,11 +842,10 @@ define <4 x float> @llvm_amdgcn_raw_ptr_buffer_load_v4f32(i32 %voffset, i32 %sof ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr 
addrspace(8) poison`, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2 @@ -905,11 +882,10 @@ define <4 x float> @llvm_amdgcn_raw_ptr_tbuffer_load_v4f32(i32 %voffset, i32 %so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) poison`, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub0 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub1 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub2 @@ -947,11 +923,10 @@ define void 
@llvm_amdgcn_raw_ptr_buffer_store_f32(float %val, i32 %voffset, i32 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into `ptr addrspace(8) poison`, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) poison, i32 %voffset, i32 %soffset, i32 0) ret void @@ -981,11 +956,10 @@ define void @llvm_amdgcn_raw_ptr_tbuffer_store_f32(float %val, i32 %voffset, i32 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY2]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into `ptr addrspace(8) poison`, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) poison, i32 %voffset, i32 %soffset, 
i32 0, i32 0) ret void @@ -1020,11 +994,10 @@ define void @llvm_amdgcn_raw_ptr_buffer_store_v2f32(<2 x float> %val, i32 %voffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[COPY4]], [[COPY1]], [[DEF2]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into `ptr addrspace(8) poison`, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.v2f32(<2 x float> %val, ptr addrspace(8) poison, i32 %voffset, i32 %soffset, i32 0) ret void @@ -1059,11 +1032,10 @@ define void @llvm_amdgcn_raw_ptr_tbuffer_store_v2f32(<2 x float> %val, i32 %voff ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY4]], [[COPY1]], [[DEF2]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into `ptr addrspace(8) poison`, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void 
@llvm.amdgcn.raw.ptr.tbuffer.store.v2f32(<2 x float> %val, ptr addrspace(8) poison, i32 %voffset, i32 %soffset, i32 0, i32 0) ret void @@ -1100,11 +1072,10 @@ define void @llvm_amdgcn_raw_ptr_buffer_store_v3f32(<3 x float> %val, i32 %voffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORDX3_OFFEN_exact [[COPY5]], [[COPY1]], [[DEF3]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s96) into `ptr addrspace(8) poison`, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.v3f32(<3 x float> %val, ptr addrspace(8) poison, i32 %voffset, i32 %soffset, i32 0) ret void @@ -1141,11 +1112,10 @@ define void @llvm_amdgcn_raw_ptr_tbuffer_store_v3f32(<3 x float> %val, i32 %voff ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[COPY5]], [[COPY1]], [[DEF3]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s96) into `ptr addrspace(8) poison`, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; 
GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.v3f32(<3 x float> %val, ptr addrspace(8) poison, i32 %voffset, i32 %soffset, i32 0, i32 0) ret void @@ -1184,11 +1154,10 @@ define void @llvm_amdgcn_raw_ptr_buffer_store_v4f32(<4 x float> %val, i32 %voffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact [[COPY6]], [[COPY1]], [[DEF4]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into `ptr addrspace(8) poison`, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %val, ptr addrspace(8) poison, i32 %voffset, i32 %soffset, i32 0) ret void @@ -1227,11 +1196,10 @@ define void @llvm_amdgcn_raw_ptr_tbuffer_store_v4f32(<4 x float> %val, i32 %voff ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[COPY6]], [[COPY1]], [[DEF4]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into `ptr addrspace(8) poison`, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], 
[[S_MOV_B64_]], %bb.1, implicit $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.v4f32(<4 x float> %val, ptr addrspace(8) poison, i32 %voffset, i32 %soffset, i32 0, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll index 3781faa54e7dc6..bfcb575aff7b82 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll @@ -132,12 +132,15 @@ endif: define amdgpu_cs void @inverse_ballot_branch(i32 inreg %s0_1, i32 inreg %s2, ptr addrspace(1) %out) { ; GISEL-LABEL: inverse_ballot_branch: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: s_xor_b32 s2, s1, -1 -; GISEL-NEXT: s_and_saveexec_b32 s1, s2 +; GISEL-NEXT: s_xor_b32 s1, s1, -1 +; GISEL-NEXT: s_mov_b32 s2, exec_lo +; GISEL-NEXT: s_and_b32 s1, s1, exec_lo +; GISEL-NEXT: s_cmov_b32 exec_lo, s1 +; GISEL-NEXT: s_cbranch_scc0 .LBB6_2 ; GISEL-NEXT: ; %bb.1: ; %if ; GISEL-NEXT: s_add_i32 s0, s0, 1 -; GISEL-NEXT: ; %bb.2: ; %endif -; GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GISEL-NEXT: .LBB6_2: ; %endif ; GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-NEXT: global_store_b32 v[0:1], v2, off ; GISEL-NEXT: s_nop 0 @@ -147,13 +150,16 @@ define amdgpu_cs void @inverse_ballot_branch(i32 inreg %s0_1, i32 inreg %s2, ptr ; SDAG-LABEL: inverse_ballot_branch: ; SDAG: ; %bb.0: ; %entry ; SDAG-NEXT: v_mov_b32_e32 v2, s0 -; SDAG-NEXT: s_xor_b32 s2, s1, -1 -; SDAG-NEXT: s_and_saveexec_b32 s1, s2 +; SDAG-NEXT: s_xor_b32 s1, s1, -1 +; SDAG-NEXT: s_mov_b32 s2, exec_lo +; SDAG-NEXT: s_and_b32 s1, s1, exec_lo +; SDAG-NEXT: s_cmov_b32 exec_lo, s1 +; SDAG-NEXT: s_cbranch_scc0 .LBB6_2 ; SDAG-NEXT: ; %bb.1: ; %if ; SDAG-NEXT: s_add_i32 s0, s0, 1 ; SDAG-NEXT: v_mov_b32_e32 v2, s0 -; SDAG-NEXT: ; %bb.2: ; %endif -; SDAG-NEXT: 
s_or_b32 exec_lo, exec_lo, s1 +; SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; SDAG-NEXT: .LBB6_2: ; %endif ; SDAG-NEXT: global_store_b32 v[0:1], v2, off ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll index 29218a3625216a..a06d0601db7f5f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll @@ -216,13 +216,16 @@ endif: define amdgpu_cs void @inverse_ballot_branch(i64 inreg %s0_1, i64 inreg %s2, ptr addrspace(1) %out) { ; GISEL-LABEL: inverse_ballot_branch: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: s_xor_b64 s[4:5], s[2:3], -1 -; GISEL-NEXT: s_and_saveexec_b64 s[2:3], s[4:5] +; GISEL-NEXT: s_xor_b64 s[2:3], s[2:3], -1 +; GISEL-NEXT: s_mov_b64 s[4:5], exec +; GISEL-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GISEL-NEXT: s_cmov_b64 exec, s[2:3] +; GISEL-NEXT: s_cbranch_scc0 .LBB6_2 ; GISEL-NEXT: ; %bb.1: ; %if ; GISEL-NEXT: s_add_u32 s0, s0, 1 ; GISEL-NEXT: s_addc_u32 s1, s1, 0 -; GISEL-NEXT: ; %bb.2: ; %endif -; GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB6_2: ; %endif ; GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -234,15 +237,18 @@ define amdgpu_cs void @inverse_ballot_branch(i64 inreg %s0_1, i64 inreg %s2, ptr ; SDAG: ; %bb.0: ; %entry ; SDAG-NEXT: v_mov_b32_e32 v3, s1 ; SDAG-NEXT: v_mov_b32_e32 v2, s0 -; SDAG-NEXT: s_xor_b64 s[4:5], s[2:3], -1 -; SDAG-NEXT: s_and_saveexec_b64 s[2:3], s[4:5] +; SDAG-NEXT: s_xor_b64 s[2:3], s[2:3], -1 +; SDAG-NEXT: s_mov_b64 s[4:5], exec +; SDAG-NEXT: s_and_b64 s[2:3], s[2:3], exec +; SDAG-NEXT: s_cmov_b64 exec, s[2:3] +; SDAG-NEXT: s_cbranch_scc0 .LBB6_2 ; SDAG-NEXT: ; %bb.1: ; %if ; SDAG-NEXT: s_add_u32 s0, s0, 1 ; SDAG-NEXT: s_addc_u32 s1, s1, 0 ; SDAG-NEXT: 
v_mov_b32_e32 v3, s1 ; SDAG-NEXT: v_mov_b32_e32 v2, s0 -; SDAG-NEXT: ; %bb.2: ; %endif -; SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB6_2: ; %endif ; SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll index 419e19083f85e3..3e1eb668ead98d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll @@ -15,9 +15,10 @@ define amdgpu_kernel void @raw_atomic_buffer_load_i32(<4 x i32> %addr) { ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: s_cbranch_execnz .LBB0_1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; CHECK-NEXT: s_cselect_b32 exec_lo, s5, s4 +; CHECK-NEXT: s_cbranch_scc1 .LBB0_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm bb: @@ -44,9 +45,10 @@ define amdgpu_kernel void @raw_atomic_buffer_load_i32_off(<4 x i32> %addr) { ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: s_cbranch_execnz .LBB1_1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; CHECK-NEXT: s_cselect_b32 exec_lo, s5, s4 +; CHECK-NEXT: s_cbranch_scc1 .LBB1_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm bb: @@ -72,9 +74,10 @@ define amdgpu_kernel void @raw_atomic_buffer_load_i32_soff(<4 x i32> %addr) { ; 
CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: s_cbranch_execnz .LBB2_1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; CHECK-NEXT: s_cselect_b32 exec_lo, s5, s4 +; CHECK-NEXT: s_cbranch_scc1 .LBB2_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm bb: @@ -100,9 +103,10 @@ define amdgpu_kernel void @raw_atomic_buffer_load_i32_dlc(<4 x i32> %addr) { ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: s_cbranch_execnz .LBB3_1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; CHECK-NEXT: s_cselect_b32 exec_lo, s5, s4 +; CHECK-NEXT: s_cbranch_scc1 .LBB3_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm bb: @@ -131,8 +135,10 @@ define amdgpu_kernel void @raw_nonatomic_buffer_load_i32(<4 x i32> %addr) { ; CHECK-NEXT: s_and_b32 s1, exec_lo, vcc_lo ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; CHECK-NEXT: s_or_b32 s0, s1, s0 -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; CHECK-NEXT: s_cbranch_execnz .LBB4_1 +; CHECK-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_cselect_b32 exec_lo, s1, s0 +; CHECK-NEXT: s_cbranch_scc1 .LBB4_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm bb: @@ -159,9 +165,10 @@ define amdgpu_kernel void @raw_atomic_buffer_load_i64(<4 x i32> %addr) { ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1] ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: s_cbranch_execnz .LBB5_1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; CHECK-NEXT: s_cselect_b32 exec_lo, s5, s4 +; CHECK-NEXT: s_cbranch_scc1 .LBB5_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm bb: @@ -189,9 +196,10 @@ define amdgpu_kernel void @raw_atomic_buffer_load_v2i16(<4 x i32> %addr) { ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: s_cbranch_execnz .LBB6_1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; CHECK-NEXT: s_cselect_b32 exec_lo, s5, s4 +; CHECK-NEXT: s_cbranch_scc1 .LBB6_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm bb: @@ -222,9 +230,10 @@ define amdgpu_kernel void @raw_atomic_buffer_load_v4i16(<4 x i32> %addr) { ; CHECK-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: s_cbranch_execnz .LBB7_1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; CHECK-NEXT: s_cselect_b32 exec_lo, s5, s4 +; CHECK-NEXT: s_cbranch_scc1 .LBB7_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm bb: @@ -253,9 +262,10 @@ define amdgpu_kernel void @raw_atomic_buffer_load_v4i32(<4 x i32> %addr) { ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v0 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: 
s_cbranch_execnz .LBB8_1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; CHECK-NEXT: s_cselect_b32 exec_lo, s5, s4 +; CHECK-NEXT: s_cbranch_scc1 .LBB8_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm bb: @@ -285,9 +295,10 @@ define amdgpu_kernel void @raw_atomic_buffer_load_ptr(<4 x i32> %addr) { ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: s_cbranch_execnz .LBB9_1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; CHECK-NEXT: s_cselect_b32 exec_lo, s5, s4 +; CHECK-NEXT: s_cbranch_scc1 .LBB9_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll index 6541ac9553231c..9dbde4c4223cdb 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll @@ -15,9 +15,10 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_ptr_load_i32(ptr addrspace(8) % ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: s_cbranch_execnz .LBB0_1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; CHECK-NEXT: s_cselect_b32 exec_lo, s5, s4 +; CHECK-NEXT: s_cbranch_scc1 .LBB0_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm bb: @@ -44,9 +45,10 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_off(ptr 
addrspace(8) % ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: s_cbranch_execnz .LBB1_1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; CHECK-NEXT: s_cselect_b32 exec_lo, s5, s4 +; CHECK-NEXT: s_cbranch_scc1 .LBB1_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm bb: @@ -72,9 +74,10 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_soff(ptr addrspace(8) ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: s_cbranch_execnz .LBB2_1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; CHECK-NEXT: s_cselect_b32 exec_lo, s5, s4 +; CHECK-NEXT: s_cbranch_scc1 .LBB2_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm bb: @@ -100,9 +103,10 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_dlc(ptr addrspace(8) % ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: s_cbranch_execnz .LBB3_1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; CHECK-NEXT: s_cselect_b32 exec_lo, s5, s4 +; CHECK-NEXT: s_cbranch_scc1 .LBB3_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm bb: @@ -131,8 +135,10 @@ define amdgpu_kernel void @raw_nonptr_atomic_buffer_load_i32(ptr addrspace(8) %p ; CHECK-NEXT: s_and_b32 s1, exec_lo, vcc_lo ; CHECK-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; CHECK-NEXT: s_or_b32 s0, s1, s0 -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; CHECK-NEXT: s_cbranch_execnz .LBB4_1 +; CHECK-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_cselect_b32 exec_lo, s1, s0 +; CHECK-NEXT: s_cbranch_scc1 .LBB4_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm bb: @@ -159,9 +165,10 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i64(ptr addrspace(8) %ptr) ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1] ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: s_cbranch_execnz .LBB5_1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; CHECK-NEXT: s_cselect_b32 exec_lo, s5, s4 +; CHECK-NEXT: s_cbranch_scc1 .LBB5_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm bb: @@ -189,9 +196,10 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v2i16(ptr addrspace(8) %pt ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: s_cbranch_execnz .LBB6_1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; CHECK-NEXT: s_cselect_b32 exec_lo, s5, s4 +; CHECK-NEXT: s_cbranch_scc1 .LBB6_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm bb: @@ -222,9 +230,10 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) %pt ; CHECK-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: s_cbranch_execnz .LBB7_1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; CHECK-NEXT: s_cselect_b32 exec_lo, s5, s4 +; CHECK-NEXT: s_cbranch_scc1 .LBB7_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm bb: @@ -253,9 +262,10 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i32(ptr addrspace(8) %pt ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v0 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: s_cbranch_execnz .LBB8_1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; CHECK-NEXT: s_cselect_b32 exec_lo, s5, s4 +; CHECK-NEXT: s_cbranch_scc1 .LBB8_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm bb: @@ -285,9 +295,10 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_ptr(ptr addrspace(8) %ptr) ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: s_cbranch_execnz .LBB9_1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; CHECK-NEXT: s_cselect_b32 exec_lo, s5, s4 +; CHECK-NEXT: s_cbranch_scc1 .LBB9_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll index 320b0b4508b6a5..4b6851af60ff95 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll 
@@ -86,17 +86,17 @@ define <2 x bfloat> @raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__vgpr_rsrc__ ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_and_b32 s0, s0, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v5, s[4:7], s3 offen offset:128 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX12-NEXT: ; implicit-def: $vgpr6 ; GFX12-NEXT: ; implicit-def: $vgpr5 -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB4_1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX12-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 128 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll index 7371d498a70706..10a98f62c54aa1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll @@ -536,19 +536,22 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-LABEL: divergent_cfg: ; GFX8DAGISEL: ; %bb.0: ; %entry ; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 -; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr4 -; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX8DAGISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr0 +; GFX8DAGISEL-NEXT: s_cmov_b64 exec, vcc +; GFX8DAGISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX8DAGISEL-NEXT: ; %bb.1: ; 
%else -; GFX8DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8DAGISEL-NEXT: s_load_dword s0, s[2:3], 0x2c ; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], s[4:5], exec +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8DAGISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GFX8DAGISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX8DAGISEL-NEXT: ; %bb.3: ; %if ; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0 @@ -561,8 +564,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX8DAGISEL-NEXT: ; %bb.5: ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -573,18 +576,22 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-LABEL: divergent_cfg: ; GFX8GISEL: ; %bb.0: ; %entry ; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX8GISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX8GISEL-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX8GISEL-NEXT: s_cmov_b64 exec, vcc +; GFX8GISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX8GISEL-NEXT: ; %bb.1: ; %else -; GFX8GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c ; GFX8GISEL-NEXT: ; 
implicit-def: $vgpr0 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8GISEL-NEXT: s_mov_b32 s6, s4 +; GFX8GISEL-NEXT: s_mov_b32 s6, s0 +; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8GISEL-NEXT: .LBB4_2: ; %Flow -; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX8GISEL-NEXT: s_xor_b64 s[0:1], s[4:5], exec +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GFX8GISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX8GISEL-NEXT: ; %bb.3: ; %if ; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX8GISEL-NEXT: s_mov_b32 s6, 0 @@ -595,8 +602,9 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-NEXT: s_max_u32 s6, s6, s8 ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4 -; GFX8GISEL-NEXT: .LBB4_5: ; %endif +; GFX8GISEL-NEXT: ; %bb.5: ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8GISEL-NEXT: .LBB4_6: ; %endif ; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -608,19 +616,22 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL-LABEL: divergent_cfg: ; GFX9DAGISEL: ; %bb.0: ; %entry ; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 -; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr4 -; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX9DAGISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr0 +; GFX9DAGISEL-NEXT: s_cmov_b64 exec, vcc +; GFX9DAGISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX9DAGISEL-NEXT: ; %bb.1: ; %else -; GFX9DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9DAGISEL-NEXT: s_load_dword s0, s[2:3], 0x2c ; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow -; 
GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], s[4:5], exec +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9DAGISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9DAGISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX9DAGISEL-NEXT: ; %bb.3: ; %if ; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0 @@ -633,8 +644,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX9DAGISEL-NEXT: ; %bb.5: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -644,18 +655,22 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL-LABEL: divergent_cfg: ; GFX9GISEL: ; %bb.0: ; %entry ; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX9GISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX9GISEL-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX9GISEL-NEXT: s_cmov_b64 exec, vcc +; GFX9GISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX9GISEL-NEXT: ; %bb.1: ; %else -; GFX9GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c ; GFX9GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9GISEL-NEXT: s_mov_b32 s6, s4 +; GFX9GISEL-NEXT: s_mov_b32 s6, s0 +; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9GISEL-NEXT: .LBB4_2: ; %Flow -; GFX9GISEL-NEXT: 
s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX9GISEL-NEXT: s_xor_b64 s[0:1], s[4:5], exec +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9GISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX9GISEL-NEXT: ; %bb.3: ; %if ; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX9GISEL-NEXT: s_mov_b32 s6, 0 @@ -666,8 +681,9 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL-NEXT: s_max_u32 s6, s6, s8 ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4 -; GFX9GISEL-NEXT: .LBB4_5: ; %endif +; GFX9GISEL-NEXT: ; %bb.5: ; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9GISEL-NEXT: .LBB4_6: ; %endif ; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -678,19 +694,22 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL-LABEL: divergent_cfg: ; GFX1064DAGISEL: ; %bb.0: ; %entry ; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 -; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr4 -; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr0 +; GFX1064DAGISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064DAGISEL-NEXT: s_cmov_b64 exec, vcc +; GFX1064DAGISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1064DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1064DAGISEL-NEXT: s_load_dword s0, s[2:3], 0x2c ; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX1064DAGISEL-NEXT: s_xor_b64 exec, 
exec, s[0:1] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], s[4:5], exec +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064DAGISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GFX1064DAGISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0 @@ -703,8 +722,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1064DAGISEL-NEXT: ; %bb.5: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -715,17 +734,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL: ; %bb.0: ; %entry ; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 ; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064GISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1064GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064GISEL-NEXT: s_cmov_b64 exec, vcc +; GFX1064GISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1064GISEL-NEXT: ; %bb.1: ; %else -; GFX1064GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1064GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c ; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064GISEL-NEXT: s_mov_b32 s6, s4 +; GFX1064GISEL-NEXT: s_mov_b32 s6, s0 +; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow -; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5 +; 
GFX1064GISEL-NEXT: s_xor_b64 s[0:1], s[4:5], exec +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GFX1064GISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX1064GISEL-NEXT: ; %bb.3: ; %if ; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX1064GISEL-NEXT: s_mov_b32 s6, 0 @@ -736,8 +759,9 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL-NEXT: s_max_u32 s6, s6, s8 ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4 -; GFX1064GISEL-NEXT: .LBB4_5: ; %endif +; GFX1064GISEL-NEXT: ; %bb.5: ; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064GISEL-NEXT: .LBB4_6: ; %endif ; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -748,19 +772,22 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032DAGISEL-LABEL: divergent_cfg: ; GFX1032DAGISEL: ; %bb.0: ; %entry ; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 -; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1 -; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr0 +; GFX1032DAGISEL-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032DAGISEL-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032DAGISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1032DAGISEL-NEXT: s_load_dword s1, s[2:3], 0x2c +; GFX1032DAGISEL-NEXT: s_load_dword s0, s[2:3], 0x2c ; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; 
GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032DAGISEL-NEXT: s_xor_b32 s0, s1, exec_lo +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032DAGISEL-NEXT: s_cmov_b32 exec_lo, s1 +; GFX1032DAGISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo ; GFX1032DAGISEL-NEXT: s_mov_b32 s1, 0 @@ -773,8 +800,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1032DAGISEL-NEXT: ; %bb.5: ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -785,17 +812,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032GISEL: ; %bb.0: ; %entry ; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0 ; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0 -; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032GISEL-NEXT: s_xor_b32 s4, vcc_lo, exec_lo +; GFX1032GISEL-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032GISEL-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032GISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1032GISEL-NEXT: ; %bb.1: ; %else ; GFX1032GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c ; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b32 s0, s0 +; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow -; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1032GISEL-NEXT: s_xor_b32 s1, s4, exec_lo +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032GISEL-NEXT: s_cmov_b32 exec_lo, s4 +; 
GFX1032GISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX1032GISEL-NEXT: ; %bb.3: ; %if ; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo ; GFX1032GISEL-NEXT: s_mov_b32 s0, 0 @@ -806,8 +837,9 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032GISEL-NEXT: s_max_u32 s0, s0, s6 ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4 -; GFX1032GISEL-NEXT: .LBB4_5: ; %endif +; GFX1032GISEL-NEXT: ; %bb.5: ; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032GISEL-NEXT: .LBB4_6: ; %endif ; GFX1032GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -818,21 +850,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164DAGISEL-LABEL: divergent_cfg: ; GFX1164DAGISEL: ; %bb.0: ; %entry ; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec -; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr4 +; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr0 ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 -; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX1164DAGISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164DAGISEL-NEXT: s_cmov_b64 exec, vcc +; GFX1164DAGISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c ; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1164DAGISEL-NEXT: 
s_cbranch_execz .LBB4_6 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], s[4:5], exec +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164DAGISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GFX1164DAGISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1164DAGISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0 @@ -846,8 +881,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1164DAGISEL-NEXT: ; %bb.5: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -859,20 +894,25 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-LABEL: divergent_cfg: ; GFX1164GISEL: ; %bb.0: ; %entry ; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec ; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6 ; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 -; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX1164GISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1164GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164GISEL-NEXT: s_cmov_b64 exec, vcc +; GFX1164GISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1164GISEL-NEXT: ; %bb.1: ; %else -; GFX1164GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1164GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c ; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164GISEL-NEXT: s_mov_b32 s6, s4 +; GFX1164GISEL-NEXT: s_mov_b32 s6, s0 +; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164GISEL-NEXT: 
.LBB4_2: ; %Flow -; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], s[4:5], exec +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GFX1164GISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX1164GISEL-NEXT: ; %bb.3: ; %if ; GFX1164GISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX1164GISEL-NEXT: s_mov_b32 s6, 0 @@ -884,8 +924,9 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-NEXT: s_max_u32 s6, s6, s8 ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4 -; GFX1164GISEL-NEXT: .LBB4_5: ; %endif +; GFX1164GISEL-NEXT: ; %bb.5: ; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164GISEL-NEXT: .LBB4_6: ; %endif ; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -898,21 +939,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132DAGISEL-LABEL: divergent_cfg: ; GFX1132DAGISEL: ; %bb.0: ; %entry ; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1 +; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr0 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 -; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 +; GFX1132DAGISEL-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132DAGISEL-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132DAGISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[2:3], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c ; GFX1132DAGISEL-NEXT: ; 
implicit-def: $vgpr0 +; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX1132DAGISEL-NEXT: s_xor_b32 s0, s1, exec_lo +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132DAGISEL-NEXT: s_cmov_b32 exec_lo, s1 +; GFX1132DAGISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1132DAGISEL-NEXT: s_mov_b32 s4, exec_lo ; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0 @@ -926,8 +970,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1132DAGISEL-NEXT: ; %bb.5: ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -939,20 +983,25 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132GISEL-LABEL: divergent_cfg: ; GFX1132GISEL: ; %bb.0: ; %entry ; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo ; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0 ; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 -; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0 +; GFX1132GISEL-NEXT: s_xor_b32 s4, vcc_lo, exec_lo +; GFX1132GISEL-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132GISEL-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132GISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1132GISEL-NEXT: ; %bb.1: ; %else ; 
GFX1132GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c ; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b32 s0, s0 +; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow -; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_xor_b32 s1, s4, exec_lo +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132GISEL-NEXT: s_cmov_b32 exec_lo, s4 +; GFX1132GISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX1132GISEL-NEXT: ; %bb.3: ; %if ; GFX1132GISEL-NEXT: s_mov_b32 s4, exec_lo ; GFX1132GISEL-NEXT: s_mov_b32 s0, 0 @@ -964,8 +1013,9 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132GISEL-NEXT: s_max_u32 s0, s0, s6 ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4 -; GFX1132GISEL-NEXT: .LBB4_5: ; %endif +; GFX1132GISEL-NEXT: ; %bb.5: ; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132GISEL-NEXT: .LBB4_6: ; %endif ; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll index 60af21524a04a1..cef809d8ec22c7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll @@ -537,19 +537,22 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-LABEL: divergent_cfg: ; GFX8DAGISEL: ; %bb.0: ; %entry ; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 -; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr4 -; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX8DAGISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; 
GFX8DAGISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr0 +; GFX8DAGISEL-NEXT: s_cmov_b64 exec, vcc +; GFX8DAGISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX8DAGISEL-NEXT: ; %bb.1: ; %else -; GFX8DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8DAGISEL-NEXT: s_load_dword s0, s[2:3], 0x2c ; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], s[4:5], exec +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8DAGISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GFX8DAGISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX8DAGISEL-NEXT: ; %bb.3: ; %if ; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX8DAGISEL-NEXT: s_mov_b32 s6, -1 @@ -562,8 +565,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX8DAGISEL-NEXT: ; %bb.5: ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -574,18 +577,22 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-LABEL: divergent_cfg: ; GFX8GISEL: ; %bb.0: ; %entry ; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX8GISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX8GISEL-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX8GISEL-NEXT: s_cmov_b64 exec, 
vcc +; GFX8GISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX8GISEL-NEXT: ; %bb.1: ; %else -; GFX8GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c ; GFX8GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8GISEL-NEXT: s_mov_b32 s6, s4 +; GFX8GISEL-NEXT: s_mov_b32 s6, s0 +; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8GISEL-NEXT: .LBB4_2: ; %Flow -; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX8GISEL-NEXT: s_xor_b64 s[0:1], s[4:5], exec +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GFX8GISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX8GISEL-NEXT: ; %bb.3: ; %if ; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX8GISEL-NEXT: s_mov_b32 s6, -1 @@ -596,8 +603,9 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-NEXT: s_min_u32 s6, s6, s8 ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4 -; GFX8GISEL-NEXT: .LBB4_5: ; %endif +; GFX8GISEL-NEXT: ; %bb.5: ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8GISEL-NEXT: .LBB4_6: ; %endif ; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -609,19 +617,22 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL-LABEL: divergent_cfg: ; GFX9DAGISEL: ; %bb.0: ; %entry ; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 -; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr4 -; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX9DAGISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr0 +; GFX9DAGISEL-NEXT: s_cmov_b64 exec, vcc +; GFX9DAGISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX9DAGISEL-NEXT: ; %bb.1: ; %else -; GFX9DAGISEL-NEXT: s_load_dword s4, 
s[2:3], 0x2c +; GFX9DAGISEL-NEXT: s_load_dword s0, s[2:3], 0x2c ; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], s[4:5], exec +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9DAGISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9DAGISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX9DAGISEL-NEXT: ; %bb.3: ; %if ; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX9DAGISEL-NEXT: s_mov_b32 s6, -1 @@ -634,8 +645,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX9DAGISEL-NEXT: ; %bb.5: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -645,18 +656,22 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL-LABEL: divergent_cfg: ; GFX9GISEL: ; %bb.0: ; %entry ; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX9GISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX9GISEL-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX9GISEL-NEXT: s_cmov_b64 exec, vcc +; GFX9GISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX9GISEL-NEXT: ; %bb.1: ; %else -; GFX9GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c ; GFX9GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX9GISEL-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX9GISEL-NEXT: s_mov_b32 s6, s4 +; GFX9GISEL-NEXT: s_mov_b32 s6, s0 +; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9GISEL-NEXT: .LBB4_2: ; %Flow -; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX9GISEL-NEXT: s_xor_b64 s[0:1], s[4:5], exec +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9GISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX9GISEL-NEXT: ; %bb.3: ; %if ; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX9GISEL-NEXT: s_mov_b32 s6, -1 @@ -667,8 +682,9 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL-NEXT: s_min_u32 s6, s6, s8 ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4 -; GFX9GISEL-NEXT: .LBB4_5: ; %endif +; GFX9GISEL-NEXT: ; %bb.5: ; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9GISEL-NEXT: .LBB4_6: ; %endif ; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -679,19 +695,22 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL-LABEL: divergent_cfg: ; GFX1064DAGISEL: ; %bb.0: ; %entry ; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 -; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr4 -; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr0 +; GFX1064DAGISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064DAGISEL-NEXT: s_cmov_b64 exec, vcc +; GFX1064DAGISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1064DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1064DAGISEL-NEXT: s_load_dword s0, s[2:3], 0x2c ; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064DAGISEL-NEXT: .LBB4_2: ; 
%Flow -; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], s[4:5], exec +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064DAGISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GFX1064DAGISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX1064DAGISEL-NEXT: s_mov_b32 s6, -1 @@ -704,8 +723,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1064DAGISEL-NEXT: ; %bb.5: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -716,17 +735,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL: ; %bb.0: ; %entry ; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 ; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064GISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1064GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064GISEL-NEXT: s_cmov_b64 exec, vcc +; GFX1064GISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1064GISEL-NEXT: ; %bb.1: ; %else -; GFX1064GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1064GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c ; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064GISEL-NEXT: s_mov_b32 s6, s4 +; GFX1064GISEL-NEXT: s_mov_b32 s6, s0 +; 
GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow -; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], s[4:5], exec +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GFX1064GISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX1064GISEL-NEXT: ; %bb.3: ; %if ; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX1064GISEL-NEXT: s_mov_b32 s6, -1 @@ -737,8 +760,9 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL-NEXT: s_min_u32 s6, s6, s8 ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4 -; GFX1064GISEL-NEXT: .LBB4_5: ; %endif +; GFX1064GISEL-NEXT: ; %bb.5: ; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064GISEL-NEXT: .LBB4_6: ; %endif ; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -749,19 +773,22 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032DAGISEL-LABEL: divergent_cfg: ; GFX1032DAGISEL: ; %bb.0: ; %entry ; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 -; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1 -; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr0 +; GFX1032DAGISEL-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032DAGISEL-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032DAGISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1032DAGISEL-NEXT: s_load_dword s1, s[2:3], 0x2c +; GFX1032DAGISEL-NEXT: s_load_dword s0, s[2:3], 0x2c ; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow -; 
GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032DAGISEL-NEXT: s_xor_b32 s0, s1, exec_lo +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032DAGISEL-NEXT: s_cmov_b32 exec_lo, s1 +; GFX1032DAGISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo ; GFX1032DAGISEL-NEXT: s_mov_b32 s1, -1 @@ -774,8 +801,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1032DAGISEL-NEXT: ; %bb.5: ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -786,17 +813,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032GISEL: ; %bb.0: ; %entry ; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0 ; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0 -; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032GISEL-NEXT: s_xor_b32 s4, vcc_lo, exec_lo +; GFX1032GISEL-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032GISEL-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032GISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1032GISEL-NEXT: ; %bb.1: ; %else ; GFX1032GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c ; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b32 s0, s0 +; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow -; GFX1032GISEL-NEXT: 
s_andn2_saveexec_b32 s1, s1 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1032GISEL-NEXT: s_xor_b32 s1, s4, exec_lo +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032GISEL-NEXT: s_cmov_b32 exec_lo, s4 +; GFX1032GISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX1032GISEL-NEXT: ; %bb.3: ; %if ; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo ; GFX1032GISEL-NEXT: s_mov_b32 s0, -1 @@ -807,8 +838,9 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032GISEL-NEXT: s_min_u32 s0, s0, s6 ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4 -; GFX1032GISEL-NEXT: .LBB4_5: ; %endif +; GFX1032GISEL-NEXT: ; %bb.5: ; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032GISEL-NEXT: .LBB4_6: ; %endif ; GFX1032GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -819,21 +851,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164DAGISEL-LABEL: divergent_cfg: ; GFX1164DAGISEL: ; %bb.0: ; %entry ; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec -; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr4 +; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr0 ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 -; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX1164DAGISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164DAGISEL-NEXT: s_cmov_b64 exec, vcc +; GFX1164DAGISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c ; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow -; 
GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], s[4:5], exec +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164DAGISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GFX1164DAGISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1164DAGISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX1164DAGISEL-NEXT: s_mov_b32 s6, -1 @@ -847,8 +882,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1164DAGISEL-NEXT: ; %bb.5: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -860,20 +895,25 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-LABEL: divergent_cfg: ; GFX1164GISEL: ; %bb.0: ; %entry ; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec ; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6 ; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 -; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX1164GISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1164GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1164GISEL-NEXT: s_cmov_b64 exec, vcc +; GFX1164GISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1164GISEL-NEXT: ; %bb.1: ; %else -; GFX1164GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1164GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c ; GFX1164GISEL-NEXT: 
; implicit-def: $vgpr0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164GISEL-NEXT: s_mov_b32 s6, s4 +; GFX1164GISEL-NEXT: s_mov_b32 s6, s0 +; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow -; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], s[4:5], exec +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GFX1164GISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX1164GISEL-NEXT: ; %bb.3: ; %if ; GFX1164GISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX1164GISEL-NEXT: s_mov_b32 s6, -1 @@ -885,8 +925,9 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-NEXT: s_min_u32 s6, s6, s8 ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4 -; GFX1164GISEL-NEXT: .LBB4_5: ; %endif +; GFX1164GISEL-NEXT: ; %bb.5: ; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164GISEL-NEXT: .LBB4_6: ; %endif ; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -899,21 +940,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132DAGISEL-LABEL: divergent_cfg: ; GFX1132DAGISEL: ; %bb.0: ; %entry ; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1 +; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr0 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 -; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 +; GFX1132DAGISEL-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132DAGISEL-NEXT: s_cmov_b32 exec_lo, vcc_lo +; 
GFX1132DAGISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[2:3], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c ; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX1132DAGISEL-NEXT: s_xor_b32 s0, s1, exec_lo +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132DAGISEL-NEXT: s_cmov_b32 exec_lo, s1 +; GFX1132DAGISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1132DAGISEL-NEXT: s_mov_b32 s4, exec_lo ; GFX1132DAGISEL-NEXT: s_mov_b32 s1, -1 @@ -927,8 +971,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1132DAGISEL-NEXT: ; %bb.5: ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -940,20 +984,25 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132GISEL-LABEL: divergent_cfg: ; GFX1132GISEL: ; %bb.0: ; %entry ; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo ; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0 ; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 -; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0 +; GFX1132GISEL-NEXT: 
s_xor_b32 s4, vcc_lo, exec_lo +; GFX1132GISEL-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1132GISEL-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132GISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1132GISEL-NEXT: ; %bb.1: ; %else ; GFX1132GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c ; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b32 s0, s0 +; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow -; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_xor_b32 s1, s4, exec_lo +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132GISEL-NEXT: s_cmov_b32 exec_lo, s4 +; GFX1132GISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX1132GISEL-NEXT: ; %bb.3: ; %if ; GFX1132GISEL-NEXT: s_mov_b32 s4, exec_lo ; GFX1132GISEL-NEXT: s_mov_b32 s0, -1 @@ -965,8 +1014,9 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132GISEL-NEXT: s_min_u32 s0, s0, s6 ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4 -; GFX1132GISEL-NEXT: .LBB4_5: ; %endif +; GFX1132GISEL-NEXT: ; %bb.5: ; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132GISEL-NEXT: .LBB4_6: ; %endif ; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll index 5fb50d7e8589a7..769e8762dc88aa 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll @@ -176,14 +176,18 @@ define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inr ; CHECK-LABEL: test_control_flow_0: ; CHECK: ; %bb.0: ; %main_body ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CHECK-NEXT: s_and_saveexec_b64 s[0:1], vcc -; 
CHECK-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; CHECK-NEXT: s_cbranch_execz .LBB6_2 +; CHECK-NEXT: s_xor_b64 s[0:1], vcc, exec +; CHECK-NEXT: s_cmp_lg_u64 vcc, 0 +; CHECK-NEXT: s_cmov_b64 exec, vcc +; CHECK-NEXT: s_cbranch_scc0 .LBB6_2 ; CHECK-NEXT: ; %bb.1: ; %ELSE ; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen +; CHECK-NEXT: s_or_b64 exec, exec, s[0:1] ; CHECK-NEXT: .LBB6_2: ; %Flow -; CHECK-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; CHECK-NEXT: s_cbranch_execz .LBB6_4 +; CHECK-NEXT: s_xor_b64 s[2:3], s[0:1], exec +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: s_cmov_b64 exec, s[0:1] +; CHECK-NEXT: s_cbranch_scc0 .LBB6_4 ; CHECK-NEXT: ; %bb.3: ; %IF ; CHECK-NEXT: v_mov_b32_e32 v0, s12 ; CHECK-NEXT: v_mov_b32_e32 v1, s13 @@ -192,8 +196,8 @@ define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inr ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_add_f32_e32 v2, v0, v1 ; CHECK-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] ; CHECK-NEXT: .LBB6_4: ; %END -; CHECK-NEXT: s_or_b64 exec, exec, s[0:1] ; CHECK-NEXT: v_mov_b32_e32 v0, v2 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: ; return to shader part epilog @@ -225,9 +229,10 @@ define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inr ; CHECK-NEXT: s_mov_b64 s[14:15], exec ; CHECK-NEXT: s_wqm_b64 exec, exec ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CHECK-NEXT: s_and_saveexec_b64 s[16:17], vcc -; CHECK-NEXT: s_xor_b64 s[16:17], exec, s[16:17] -; CHECK-NEXT: s_cbranch_execz .LBB7_2 +; CHECK-NEXT: s_xor_b64 s[16:17], vcc, exec +; CHECK-NEXT: s_cmp_lg_u64 vcc, 0 +; CHECK-NEXT: s_cmov_b64 exec, vcc +; CHECK-NEXT: s_cbranch_scc0 .LBB7_2 ; CHECK-NEXT: ; %bb.1: ; %ELSE ; CHECK-NEXT: image_sample v1, v0, s[0:7], s[8:11] dmask:0x1 ; CHECK-NEXT: s_and_saveexec_b64 s[18:19], s[14:15] @@ -236,9 +241,12 @@ define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inr ; CHECK-NEXT: s_waitcnt vmcnt(0) ; 
CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen ; CHECK-NEXT: s_mov_b64 exec, s[18:19] +; CHECK-NEXT: s_or_b64 exec, exec, s[16:17] ; CHECK-NEXT: .LBB7_2: ; %Flow -; CHECK-NEXT: s_andn2_saveexec_b64 s[0:1], s[16:17] -; CHECK-NEXT: s_cbranch_execz .LBB7_4 +; CHECK-NEXT: s_xor_b64 s[0:1], s[16:17], exec +; CHECK-NEXT: s_cmp_lg_u64 s[16:17], 0 +; CHECK-NEXT: s_cmov_b64 exec, s[16:17] +; CHECK-NEXT: s_cbranch_scc0 .LBB7_4 ; CHECK-NEXT: ; %bb.3: ; %IF ; CHECK-NEXT: v_mov_b32_e32 v0, s12 ; CHECK-NEXT: v_mov_b32_e32 v1, s13 @@ -247,8 +255,8 @@ define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inr ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_add_f32_e32 v2, v0, v1 ; CHECK-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec -; CHECK-NEXT: .LBB7_4: ; %END ; CHECK-NEXT: s_or_b64 exec, exec, s[0:1] +; CHECK-NEXT: .LBB7_4: ; %END ; CHECK-NEXT: s_and_b64 exec, exec, s[14:15] ; CHECK-NEXT: v_mov_b32_e32 v0, v2 ; CHECK-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll index 2a979976d806c9..392738fa6f8b6f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll @@ -17,9 +17,10 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32(<4 x i32> %addr, i32 %i ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: s_cbranch_execnz .LBB0_1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; CHECK-NEXT: s_cselect_b32 exec_lo, s5, s4 +; CHECK-NEXT: s_cbranch_scc1 .LBB0_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm bb: @@ -46,9 +47,10 @@ define amdgpu_kernel void 
@struct_atomic_buffer_load_i32_const_idx(<4 x i32> %ad ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: s_cbranch_execnz .LBB1_1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; CHECK-NEXT: s_cselect_b32 exec_lo, s5, s4 +; CHECK-NEXT: s_cbranch_scc1 .LBB1_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm bb: @@ -77,9 +79,10 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32_off(<4 x i32> %addr, i3 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: s_cbranch_execnz .LBB2_1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; CHECK-NEXT: s_cselect_b32 exec_lo, s5, s4 +; CHECK-NEXT: s_cbranch_scc1 .LBB2_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm bb: @@ -108,9 +111,10 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32_soff(<4 x i32> %addr, i ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: s_cbranch_execnz .LBB3_1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; CHECK-NEXT: s_cselect_b32 exec_lo, s5, s4 +; CHECK-NEXT: s_cbranch_scc1 .LBB3_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm bb: @@ -138,9 +142,10 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32_dlc(<4 x i32> %addr, i3 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; 
CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: s_cbranch_execnz .LBB4_1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; CHECK-NEXT: s_cselect_b32 exec_lo, s5, s4 +; CHECK-NEXT: s_cbranch_scc1 .LBB4_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm bb: @@ -171,8 +176,10 @@ define amdgpu_kernel void @struct_nonatomic_buffer_load_i32(<4 x i32> %addr, i32 ; CHECK-NEXT: s_and_b32 s1, exec_lo, vcc_lo ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; CHECK-NEXT: s_or_b32 s0, s1, s0 -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; CHECK-NEXT: s_cbranch_execnz .LBB5_1 +; CHECK-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_cselect_b32 exec_lo, s1, s0 +; CHECK-NEXT: s_cbranch_scc1 .LBB5_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm bb: @@ -202,9 +209,10 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i64(<4 x i32> %addr, i32 %i ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[3:4], v[0:1] ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: s_cbranch_execnz .LBB6_1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; CHECK-NEXT: s_cselect_b32 exec_lo, s5, s4 +; CHECK-NEXT: s_cbranch_scc1 .LBB6_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm bb: @@ -234,9 +242,10 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v2i16(<4 x i32> %addr, i32 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: s_cbranch_execnz .LBB7_1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; CHECK-NEXT: s_cselect_b32 exec_lo, s5, s4 +; CHECK-NEXT: s_cbranch_scc1 .LBB7_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm bb: @@ -269,9 +278,10 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v4i16(<4 x i32> %addr, i32 ; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: s_cbranch_execnz .LBB8_1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; CHECK-NEXT: s_cselect_b32 exec_lo, s5, s4 +; CHECK-NEXT: s_cbranch_scc1 .LBB8_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm bb: @@ -302,9 +312,10 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v4i32(<4 x i32> %addr, i32 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v0 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: s_cbranch_execnz .LBB9_1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; CHECK-NEXT: s_cselect_b32 exec_lo, s5, s4 +; CHECK-NEXT: s_cbranch_scc1 .LBB9_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm bb: @@ -336,9 +347,10 @@ define amdgpu_kernel void @struct_atomic_buffer_load_ptr(<4 x i32> %addr, i32 %i ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; 
CHECK-NEXT: s_cbranch_execnz .LBB10_1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; CHECK-NEXT: s_cselect_b32 exec_lo, s5, s4 +; CHECK-NEXT: s_cbranch_scc1 .LBB10_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll index 0522d5258b9b5f..14f09a5e1ae246 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll @@ -21,13 +21,13 @@ define amdgpu_gs void @main(<4 x i32> %arg, i32 %arg1) { ; GFX10-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX10-NEXT: s_and_saveexec_b32 s0, s0 ; GFX10-NEXT: buffer_load_format_d16_xyz v[5:6], v4, s[4:7], 0 idxen +; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: s_cbranch_execnz .LBB0_1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX10-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v5 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v6 @@ -49,12 +49,12 @@ define amdgpu_gs void @main(<4 x i32> %arg, i32 %arg1) { ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_format_d16_xyz v[5:6], v4, s[4:7], 0 idxen +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr4 -; GFX9-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB0_1 +; GFX9-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX9-NEXT: ; %bb.2: -; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; 
GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v6 @@ -75,12 +75,12 @@ define amdgpu_gs void @main(<4 x i32> %arg, i32 %arg1) { ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_load_format_d16_xyz v[5:6], v4, s[4:7], 0 idxen +; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX8-NEXT: ; implicit-def: $vgpr4 -; GFX8-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_cbranch_execnz .LBB0_1 +; GFX8-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX8-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v5 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v6 @@ -103,12 +103,13 @@ define amdgpu_gs void @main(<4 x i32> %arg, i32 %arg1) { ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_d16_format_xyz v[5:6], v4, s[4:7], 0 idxen +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-NEXT: ; implicit-def: $vgpr4 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB0_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v5 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v1, 0xffff, v6 @@ -130,12 +131,13 @@ define amdgpu_gs void @main(<4 x i32> %arg, i32 %arg1) { ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_d16_format_xyz v[5:6], v4, s[4:7], null idxen +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; 
GFX12-NEXT: s_cbranch_execnz .LBB0_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v5 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v1, 0xffff, v6 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll index 70296a0a7bec6a..7c6ac1555fd93f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll @@ -17,9 +17,10 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32(ptr addrspace(8) %p ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: s_cbranch_execnz .LBB0_1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; CHECK-NEXT: s_cselect_b32 exec_lo, s5, s4 +; CHECK-NEXT: s_cbranch_scc1 .LBB0_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm bb: @@ -46,9 +47,10 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_const_idx(ptr addrs ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: s_cbranch_execnz .LBB1_1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; CHECK-NEXT: s_cselect_b32 exec_lo, s5, s4 +; CHECK-NEXT: s_cbranch_scc1 .LBB1_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm bb: @@ -77,9 
+79,10 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_off(ptr addrspace(8 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: s_cbranch_execnz .LBB2_1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; CHECK-NEXT: s_cselect_b32 exec_lo, s5, s4 +; CHECK-NEXT: s_cbranch_scc1 .LBB2_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm bb: @@ -108,9 +111,10 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_soff(ptr addrspace( ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: s_cbranch_execnz .LBB3_1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; CHECK-NEXT: s_cselect_b32 exec_lo, s5, s4 +; CHECK-NEXT: s_cbranch_scc1 .LBB3_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm bb: @@ -138,9 +142,10 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_dlc(ptr addrspace(8 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: s_cbranch_execnz .LBB4_1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; CHECK-NEXT: s_cselect_b32 exec_lo, s5, s4 +; CHECK-NEXT: s_cbranch_scc1 .LBB4_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm bb: @@ -171,8 +176,10 @@ define amdgpu_kernel void @struct_ptr_nonatomic_buffer_load_i32(ptr addrspace(8) ; 
CHECK-NEXT: s_and_b32 s1, exec_lo, vcc_lo ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; CHECK-NEXT: s_or_b32 s0, s1, s0 -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; CHECK-NEXT: s_cbranch_execnz .LBB5_1 +; CHECK-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_cselect_b32 exec_lo, s1, s0 +; CHECK-NEXT: s_cbranch_scc1 .LBB5_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm bb: @@ -202,9 +209,10 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i64(ptr addrspace(8) %p ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[3:4], v[0:1] ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: s_cbranch_execnz .LBB6_1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; CHECK-NEXT: s_cselect_b32 exec_lo, s5, s4 +; CHECK-NEXT: s_cbranch_scc1 .LBB6_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm bb: @@ -234,9 +242,10 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v2i16(ptr addrspace(8) ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: s_cbranch_execnz .LBB7_1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; CHECK-NEXT: s_cselect_b32 exec_lo, s5, s4 +; CHECK-NEXT: s_cbranch_scc1 .LBB7_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm bb: @@ -269,9 +278,10 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) ; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; 
CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: s_cbranch_execnz .LBB8_1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; CHECK-NEXT: s_cselect_b32 exec_lo, s5, s4 +; CHECK-NEXT: s_cbranch_scc1 .LBB8_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm bb: @@ -302,9 +312,10 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i32(ptr addrspace(8) ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v0 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: s_cbranch_execnz .LBB9_1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; CHECK-NEXT: s_cselect_b32 exec_lo, s5, s4 +; CHECK-NEXT: s_cbranch_scc1 .LBB9_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm bb: @@ -336,9 +347,10 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_ptr(ptr addrspace(8) %p ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: s_cbranch_execnz .LBB10_1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; CHECK-NEXT: s_cselect_b32 exec_lo, s5, s4 +; CHECK-NEXT: s_cbranch_scc1 .LBB10_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll index 78204dfefc80cc..917405c9103c50 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll @@ -55,17 +55,17 @@ define <2 x bfloat> @struct_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__vgpr_rsr ; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1200-NEXT: s_and_b32 s0, s0, s1 -; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1200-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1200-NEXT: s_wait_loadcnt 0x0 ; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v[5:6], s[4:7], s3 idxen offen th:TH_ATOMIC_RETURN +; GFX1200-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX1200-NEXT: ; implicit-def: $vgpr7 ; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1200-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1200-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX1200-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1200-NEXT: ; %bb.2: -; GFX1200-NEXT: s_mov_b32 exec_lo, s2 ; GFX1200-NEXT: s_wait_loadcnt 0x0 ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x bfloat> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -95,17 +95,17 @@ define void @struct_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__vgpr_rsrc__vgp ; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1200-NEXT: s_and_b32 s0, s0, s1 -; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX1200-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v[5:6], s[4:7], s3 idxen offen +; GFX1200-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX1200-NEXT: ; implicit-def: $vgpr7 ; GFX1200-NEXT: ; implicit-def: 
$vgpr0 ; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1200-NEXT: s_cbranch_execnz .LBB3_1 +; GFX1200-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX1200-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1200-NEXT: ; %bb.2: -; GFX1200-NEXT: s_mov_b32 exec_lo, s2 ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x bfloat> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll index 10059960030446..54facdb3e9ce3b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll @@ -209,14 +209,14 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: buffer_atomic_add_f32 v0, v[5:6], s[8:11], s12 idxen offen +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX908-NEXT: ; implicit-def: $vgpr7 ; GFX908-NEXT: ; implicit-def: $vgpr0 ; GFX908-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB4_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX908-NEXT: ; %bb.2: -; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -244,14 +244,14 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: buffer_atomic_add_f32 v0, v[8:9], s[8:11], s12 idxen offen +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: ; implicit-def: 
$vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $vgpr7 ; GFX90A-NEXT: ; implicit-def: $vgpr0 ; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9 -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX90A-NEXT: ; %bb.2: -; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -278,14 +278,14 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX940-NEXT: s_and_b64 s[0:1], s[0:1], vcc ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: buffer_atomic_add_f32 v0, v[8:9], s[4:7], s8 idxen offen +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX940-NEXT: ; implicit-def: $vgpr7 ; GFX940-NEXT: ; implicit-def: $vgpr0 ; GFX940-NEXT: ; implicit-def: $vgpr8_vgpr9 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB4_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -311,17 +311,17 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1200-NEXT: s_and_b32 s0, s0, s1 -; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX1200-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[5:6], s[4:7], s3 idxen offen +; GFX1200-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX1200-NEXT: ; implicit-def: $vgpr7 ; GFX1200-NEXT: ; implicit-def: $vgpr0 ; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6 -; 
GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1200-NEXT: s_cbranch_execnz .LBB4_1 +; GFX1200-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX1200-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX1200-NEXT: ; %bb.2: -; GFX1200-NEXT: s_mov_b32 exec_lo, s2 ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -347,14 +347,14 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__vgpr_rsrc__vgpr ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, v[5:6], s[8:11], s12 idxen offen +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX908-NEXT: ; implicit-def: $vgpr7 ; GFX908-NEXT: ; implicit-def: $vgpr0 ; GFX908-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB5_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX908-NEXT: ; %bb.2: -; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -382,14 +382,14 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__vgpr_rsrc__vgpr ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v[8:9], s[8:11], s12 idxen offen +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $vgpr7 ; GFX90A-NEXT: ; implicit-def: $vgpr0 ; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9 -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX90A-NEXT: ; %bb.2: -; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 
s[30:31] ; @@ -416,14 +416,14 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__vgpr_rsrc__vgpr ; GFX940-NEXT: s_and_b64 s[0:1], s[0:1], vcc ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v[8:9], s[4:7], s8 idxen offen +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX940-NEXT: ; implicit-def: $vgpr7 ; GFX940-NEXT: ; implicit-def: $vgpr0 ; GFX940-NEXT: ; implicit-def: $vgpr8_vgpr9 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB5_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -449,17 +449,17 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__vgpr_rsrc__vgpr ; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1200-NEXT: s_and_b32 s0, s0, s1 -; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX1200-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1200-NEXT: buffer_atomic_pk_add_f16 v0, v[5:6], s[4:7], s3 idxen offen +; GFX1200-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX1200-NEXT: ; implicit-def: $vgpr7 ; GFX1200-NEXT: ; implicit-def: $vgpr0 ; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1200-NEXT: s_cbranch_execnz .LBB5_1 +; GFX1200-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX1200-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1200-NEXT: ; %bb.2: -; GFX1200-NEXT: s_mov_b32 exec_lo, s2 ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret 
void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll index 5f6a67e4660209..916c1d9d8a6c33 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll @@ -174,13 +174,13 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_add_f32 v0, v[8:9], s[8:11], s12 idxen offen glc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $vgpr7 ; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9 -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX90A-NEXT: ; %bb.2: -; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -208,13 +208,13 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_add_f32 v0, v[8:9], s[4:7], s8 idxen offen sc0 +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX940-NEXT: ; implicit-def: $vgpr7 ; GFX940-NEXT: ; implicit-def: $vgpr8_vgpr9 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB4_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -240,17 +240,17 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_vo ; 
GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1200-NEXT: s_and_b32 s0, s0, s1 -; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1200-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1200-NEXT: s_wait_loadcnt 0x0 ; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[5:6], s[4:7], s3 idxen offen th:TH_ATOMIC_RETURN +; GFX1200-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX1200-NEXT: ; implicit-def: $vgpr7 ; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1200-NEXT: s_cbranch_execnz .LBB4_1 +; GFX1200-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX1200-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX1200-NEXT: ; %bb.2: -; GFX1200-NEXT: s_mov_b32 exec_lo, s2 ; GFX1200-NEXT: s_wait_loadcnt 0x0 ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -283,13 +283,13 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__ ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v[8:9], s[8:11], s12 idxen offen glc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $vgpr7 ; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9 -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX90A-NEXT: ; %bb.2: -; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -317,13 +317,13 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__ ; GFX940-NEXT: 
s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v[8:9], s[4:7], s8 idxen offen sc0 +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX940-NEXT: ; implicit-def: $vgpr7 ; GFX940-NEXT: ; implicit-def: $vgpr8_vgpr9 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB5_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -349,17 +349,17 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__ ; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1200-NEXT: s_and_b32 s0, s0, s1 -; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1200-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1200-NEXT: s_wait_loadcnt 0x0 ; GFX1200-NEXT: buffer_atomic_pk_add_f16 v0, v[5:6], s[4:7], s3 idxen offen th:TH_ATOMIC_RETURN +; GFX1200-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX1200-NEXT: ; implicit-def: $vgpr7 ; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1200-NEXT: s_cbranch_execnz .LBB5_1 +; GFX1200-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX1200-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1200-NEXT: ; %bb.2: -; GFX1200-NEXT: s_mov_b32 exec_lo, s2 ; GFX1200-NEXT: s_wait_loadcnt 0x0 ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll index bd803c380e90a5..7d13b316d92581 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll @@ -454,12 +454,12 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_fmax v0, v[5:6], s[8:11], s6 idxen offen offset:256 glc +; GFX6-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX6-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX6-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB8_1 +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX6-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -478,12 +478,12 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_fmax v0, v[5:6], s[8:11], s6 idxen offen offset:256 glc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX7-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX7-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX7-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -502,13 +502,13 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_atomic_fmax v0, v[5:6], s[8:11], s6 idxen offen offset:256 glc +; GFX10-NEXT: s_xor_b32 s4, 
exec_lo, s4 ; GFX10-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX10-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB8_1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -529,12 +529,13 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX11-NEXT: s_and_saveexec_b32 s1, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_max_f32 v0, v[5:6], s[4:7], s0 idxen offen offset:256 glc +; GFX11-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX11-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX11-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s2 +; GFX11-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -559,12 +560,13 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX12-NEXT: s_and_saveexec_b32 s1, s1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[5:6], s[4:7], s0 idxen offen offset:256 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB8_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s2 +; GFX12-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 
@@ -592,13 +594,13 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_fmax v0, v[5:6], s[8:11], s12 idxen offen offset:256 glc +; GFX6-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX6-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX6-NEXT: ; implicit-def: $vgpr7 ; GFX6-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB9_1 +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX6-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -620,13 +622,13 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_fmax v0, v[5:6], s[8:11], s12 idxen offen offset:256 glc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX7-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX7-NEXT: ; implicit-def: $vgpr7 ; GFX7-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB9_1 +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX7-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -648,14 +650,14 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_atomic_fmax v0, v[5:6], s[8:11], s7 idxen offen offset:256 glc +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX10-NEXT: ; implicit-def: $vgpr7 ; GFX10-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 
exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB9_1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s6 +; GFX10-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -677,17 +679,17 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, s0, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_max_f32 v0, v[5:6], s[4:7], s3 idxen offen offset:256 glc +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX11-NEXT: ; implicit-def: $vgpr7 ; GFX11-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX11-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -713,17 +715,17 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_and_b32 s0, s0, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[5:6], s[4:7], s3 idxen offen offset:256 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX12-NEXT: ; implicit-def: $vgpr7 ; 
GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB9_1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX12-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64.ll index 4f9bac584a78e4..92bfc437e0946a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64.ll @@ -233,12 +233,12 @@ define double @struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__vgpr_rsrc__vgpr_ ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[6:7], s[8:11], s6 idxen offen offset:256 glc +; GFX6-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX6-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX6-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB8_1 +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX6-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -257,12 +257,12 @@ define double @struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__vgpr_rsrc__vgpr_ ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[6:7], s[8:11], s6 idxen offen offset:256 glc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX7-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7-NEXT: s_cselect_b64 
exec, s[4:5], s[12:13] +; GFX7-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 @@ -290,13 +290,13 @@ define double @struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[6:7], s[8:11], s12 idxen offen offset:256 glc +; GFX6-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX6-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX6-NEXT: ; implicit-def: $vgpr8 ; GFX6-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB9_1 +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX6-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -318,13 +318,13 @@ define double @struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[6:7], s[8:11], s12 idxen offen offset:256 glc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX7-NEXT: ; implicit-def: $vgpr8 ; GFX7-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB9_1 +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX7-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll index c9b50eddc94eef..346746fa2c1e80 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll @@ -454,12 +454,12 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_fmin v0, v[5:6], s[8:11], s6 idxen offen offset:256 glc +; GFX6-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX6-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX6-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB8_1 +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX6-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -478,12 +478,12 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_fmin v0, v[5:6], s[8:11], s6 idxen offen offset:256 glc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX7-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX7-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX7-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -502,13 +502,13 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_atomic_fmin v0, v[5:6], s[8:11], s6 idxen offen offset:256 glc +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX10-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX10-NEXT: 
s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB8_1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -529,12 +529,13 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX11-NEXT: s_and_saveexec_b32 s1, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_min_f32 v0, v[5:6], s[4:7], s0 idxen offen offset:256 glc +; GFX11-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX11-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX11-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s2 +; GFX11-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -559,12 +560,13 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX12-NEXT: s_and_saveexec_b32 s1, s1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[5:6], s[4:7], s0 idxen offen offset:256 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB8_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s2 +; GFX12-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 @@ -592,13 +594,13 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX6-NEXT: 
s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_fmin v0, v[5:6], s[8:11], s12 idxen offen offset:256 glc +; GFX6-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX6-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX6-NEXT: ; implicit-def: $vgpr7 ; GFX6-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB9_1 +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX6-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -620,13 +622,13 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_fmin v0, v[5:6], s[8:11], s12 idxen offen offset:256 glc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX7-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX7-NEXT: ; implicit-def: $vgpr7 ; GFX7-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB9_1 +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX7-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -648,14 +650,14 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_atomic_fmin v0, v[5:6], s[8:11], s7 idxen offen offset:256 glc +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX10-NEXT: ; implicit-def: $vgpr7 ; GFX10-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB9_1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s6 +; GFX10-NEXT: 
s_cbranch_scc1 .LBB9_1 ; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -677,17 +679,17 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, s0, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_min_f32 v0, v[5:6], s[4:7], s3 idxen offen offset:256 glc +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX11-NEXT: ; implicit-def: $vgpr7 ; GFX11-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX11-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -713,17 +715,17 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_and_b32 s0, s0, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[5:6], s[4:7], s3 idxen offen offset:256 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX12-NEXT: ; implicit-def: $vgpr7 ; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz 
.LBB9_1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX12-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64.ll index 01bc833d59be79..e6217113cc69c1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64.ll @@ -233,12 +233,12 @@ define double @struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__vgpr_rsrc__vgpr_ ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[6:7], s[8:11], s6 idxen offen offset:256 glc +; GFX6-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX6-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX6-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB8_1 +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX6-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -257,12 +257,12 @@ define double @struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__vgpr_rsrc__vgpr_ ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[6:7], s[8:11], s6 idxen offen offset:256 glc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX7-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX7-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[12:13] 
; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 @@ -290,13 +290,13 @@ define double @struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[6:7], s[8:11], s12 idxen offen offset:256 glc +; GFX6-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX6-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX6-NEXT: ; implicit-def: $vgpr8 ; GFX6-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB9_1 +; GFX6-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX6-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -318,13 +318,13 @@ define double @struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[6:7], s[8:11], s12 idxen offen offset:256 glc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX7-NEXT: ; implicit-def: $vgpr8 ; GFX7-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB9_1 +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX7-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.v3f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.v3f16.ll index 46b2516f72f8ea..a92de8b29381c8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.v3f16.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.v3f16.ll @@ -19,13 +19,13 @@ define amdgpu_gs void @main(ptr addrspace(8) %arg, i32 %arg1) { ; GFX10-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX10-NEXT: s_and_saveexec_b32 s0, s0 ; GFX10-NEXT: buffer_load_format_d16_xyz v[5:6], v4, s[4:7], 0 idxen +; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: s_cbranch_execnz .LBB0_1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX10-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v5 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v6 @@ -47,12 +47,12 @@ define amdgpu_gs void @main(ptr addrspace(8) %arg, i32 %arg1) { ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_format_d16_xyz v[5:6], v4, s[4:7], 0 idxen +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr4 -; GFX9-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB0_1 +; GFX9-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX9-NEXT: ; %bb.2: -; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v6 @@ -73,12 +73,12 @@ define amdgpu_gs void @main(ptr addrspace(8) %arg, i32 %arg1) { ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_load_format_d16_xyz v[5:6], v4, s[4:7], 0 idxen +; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX8-NEXT: ; implicit-def: $vgpr4 -; GFX8-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_cbranch_execnz .LBB0_1 +; GFX8-NEXT: s_cselect_b64 exec, 
s[0:1], s[2:3] +; GFX8-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v5 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v6 @@ -101,12 +101,13 @@ define amdgpu_gs void @main(ptr addrspace(8) %arg, i32 %arg1) { ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_d16_format_xyz v[5:6], v4, s[4:7], 0 idxen +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-NEXT: ; implicit-def: $vgpr4 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB0_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v5 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v1, 0xffff, v6 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll index 684ca3aac7c315..13dacedd7f02af 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll @@ -159,21 +159,22 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) { ; SI: ; %bb.0: ; %.entry ; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 ; SI-NEXT: v_cvt_i32_f32_e32 v1, v1 +; SI-NEXT: s_mov_b64 s[4:5], exec ; SI-NEXT: s_mov_b64 s[2:3], exec ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 1, v0 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0 -; SI-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] -; SI-NEXT: s_xor_b64 s[0:1], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB2_3 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; SI-NEXT: s_cmp_lg_u64 s[0:1], 0 +; SI-NEXT: s_cmov_b64 exec, s[0:1] +; SI-NEXT: s_cbranch_scc0 .LBB2_3 ; SI-NEXT: ; %bb.1: ; %.demote -; 
SI-NEXT: s_andn2_b64 s[2:3], s[2:3], exec +; SI-NEXT: s_andn2_b64 s[4:5], s[4:5], exec ; SI-NEXT: s_cbranch_scc0 .LBB2_4 ; SI-NEXT: ; %bb.2: ; %.demote ; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: .LBB2_3: ; %.continue -; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc ; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm ; SI-NEXT: s_endpgm @@ -186,21 +187,22 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) { ; GFX9: ; %bb.0: ; %.entry ; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_cmov_b64 exec, s[0:1] +; GFX9-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX9-NEXT: ; %bb.1: ; %.demote -; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_andn2_b64 s[4:5], s[4:5], exec ; GFX9-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX9-NEXT: ; %bb.2: ; %.demote ; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: .LBB2_3: ; %.continue -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc ; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm ; GFX9-NEXT: s_endpgm @@ -213,21 +215,22 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) { ; GFX10-32: ; %bb.0: ; %.entry ; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX10-32-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX10-32-NEXT: s_mov_b32 s2, exec_lo ; GFX10-32-NEXT: s_mov_b32 s1, exec_lo ; GFX10-32-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-32-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-32-NEXT: 
v_cmp_eq_u32_e64 s0, 1, v0 -; GFX10-32-NEXT: s_and_saveexec_b32 s2, s0 -; GFX10-32-NEXT: s_xor_b32 s0, exec_lo, s2 -; GFX10-32-NEXT: s_cbranch_execz .LBB2_3 +; GFX10-32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10-32-NEXT: s_cmov_b32 exec_lo, s0 +; GFX10-32-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote -; GFX10-32-NEXT: s_andn2_b32 s1, s1, exec_lo +; GFX10-32-NEXT: s_andn2_b32 s2, s2, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX10-32-NEXT: ; %bb.2: ; %.demote ; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX10-32-NEXT: .LBB2_3: ; %.continue -; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo ; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm ; GFX10-32-NEXT: s_endpgm @@ -240,21 +243,22 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) { ; GFX10-64: ; %bb.0: ; %.entry ; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX10-64-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX10-64-NEXT: s_mov_b64 s[4:5], exec ; GFX10-64-NEXT: s_mov_b64 s[2:3], exec ; GFX10-64-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-64-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10-64-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0 -; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] -; GFX10-64-NEXT: s_xor_b64 s[0:1], exec, s[4:5] -; GFX10-64-NEXT: s_cbranch_execz .LBB2_3 +; GFX10-64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10-64-NEXT: s_cmov_b64 exec, s[0:1] +; GFX10-64-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote -; GFX10-64-NEXT: s_andn2_b64 s[2:3], s[2:3], exec +; GFX10-64-NEXT: s_andn2_b64 s[4:5], s[4:5], exec ; GFX10-64-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX10-64-NEXT: ; %bb.2: ; %.demote ; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10-64-NEXT: .LBB2_3: ; %.continue -; GFX10-64-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-64-NEXT: 
v_cndmask_b32_e64 v0, 0, 1.0, vcc ; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm ; GFX10-64-NEXT: s_endpgm @@ -287,17 +291,18 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre ; SI-NEXT: s_mov_b64 s[12:13], exec ; SI-NEXT: s_wqm_b64 exec, exec ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 -; SI-NEXT: s_and_saveexec_b64 s[14:15], vcc -; SI-NEXT: s_xor_b64 s[14:15], exec, s[14:15] -; SI-NEXT: s_cbranch_execz .LBB3_3 +; SI-NEXT: s_mov_b64 s[14:15], exec +; SI-NEXT: s_cmp_lg_u64 vcc, 0 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB3_3 ; SI-NEXT: ; %bb.1: ; %.demote ; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; SI-NEXT: s_cbranch_scc0 .LBB3_4 ; SI-NEXT: ; %bb.2: ; %.demote ; SI-NEXT: s_wqm_b64 s[16:17], s[12:13] ; SI-NEXT: s_and_b64 exec, exec, s[16:17] -; SI-NEXT: .LBB3_3: ; %.continue ; SI-NEXT: s_or_b64 exec, exec, s[14:15] +; SI-NEXT: .LBB3_3: ; %.continue ; SI-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v0, v0, v0 @@ -316,17 +321,18 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 -; GFX9-NEXT: s_and_saveexec_b64 s[14:15], vcc -; GFX9-NEXT: s_xor_b64 s[14:15], exec, s[14:15] -; GFX9-NEXT: s_cbranch_execz .LBB3_3 +; GFX9-NEXT: s_mov_b64 s[14:15], exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX9-NEXT: ; %bb.1: ; %.demote ; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; GFX9-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX9-NEXT: ; %bb.2: ; %.demote ; GFX9-NEXT: s_wqm_b64 s[16:17], s[12:13] ; GFX9-NEXT: s_and_b64 exec, exec, s[16:17] -; GFX9-NEXT: .LBB3_3: ; %.continue ; GFX9-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-NEXT: .LBB3_3: ; %.continue ; GFX9-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: 
v_add_f32_e32 v0, v0, v0 @@ -345,17 +351,18 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-32-NEXT: s_mov_b32 s12, exec_lo ; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v1 -; GFX10-32-NEXT: s_and_saveexec_b32 s13, vcc_lo -; GFX10-32-NEXT: s_xor_b32 s13, exec_lo, s13 -; GFX10-32-NEXT: s_cbranch_execz .LBB3_3 +; GFX10-32-NEXT: s_mov_b32 s13, exec_lo +; GFX10-32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-32-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote ; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX10-32-NEXT: ; %bb.2: ; %.demote ; GFX10-32-NEXT: s_wqm_b32 s14, s12 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s14 -; GFX10-32-NEXT: .LBB3_3: ; %.continue ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-32-NEXT: .LBB3_3: ; %.continue ; GFX10-32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX10-32-NEXT: s_waitcnt vmcnt(0) ; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 @@ -374,17 +381,18 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-64-NEXT: s_mov_b64 s[12:13], exec ; GFX10-64-NEXT: s_wqm_b64 exec, exec ; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 -; GFX10-64-NEXT: s_and_saveexec_b64 s[14:15], vcc -; GFX10-64-NEXT: s_xor_b64 s[14:15], exec, s[14:15] -; GFX10-64-NEXT: s_cbranch_execz .LBB3_3 +; GFX10-64-NEXT: s_mov_b64 s[14:15], exec +; GFX10-64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX10-64-NEXT: s_cmov_b64 exec, vcc +; GFX10-64-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote ; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; GFX10-64-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX10-64-NEXT: ; %bb.2: ; %.demote ; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] -; GFX10-64-NEXT: .LBB3_3: ; %.continue ; GFX10-64-NEXT: s_or_b64 exec, exec, s[14:15] +; 
GFX10-64-NEXT: .LBB3_3: ; %.continue ; GFX10-64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX10-64-NEXT: s_waitcnt vmcnt(0) ; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 @@ -421,19 +429,20 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre ; SI-NEXT: s_mov_b64 s[12:13], exec ; SI-NEXT: s_wqm_b64 exec, exec ; SI-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 +; SI-NEXT: s_mov_b64 s[14:15], exec ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 -; SI-NEXT: s_and_saveexec_b64 s[14:15], vcc -; SI-NEXT: s_xor_b64 s[14:15], exec, s[14:15] -; SI-NEXT: s_cbranch_execz .LBB4_3 +; SI-NEXT: s_cmp_lg_u64 vcc, 0 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB4_3 ; SI-NEXT: ; %bb.1: ; %.demote ; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; SI-NEXT: s_cbranch_scc0 .LBB4_4 ; SI-NEXT: ; %bb.2: ; %.demote ; SI-NEXT: s_wqm_b64 s[16:17], s[12:13] ; SI-NEXT: s_and_b64 exec, exec, s[16:17] -; SI-NEXT: .LBB4_3: ; %.continue ; SI-NEXT: s_or_b64 exec, exec, s[14:15] +; SI-NEXT: .LBB4_3: ; %.continue ; SI-NEXT: v_add_f32_e32 v0, v0, v0 ; SI-NEXT: s_and_b64 exec, exec, s[12:13] ; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf @@ -450,19 +459,20 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 +; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[14:15], vcc -; GFX9-NEXT: s_xor_b64 s[14:15], exec, s[14:15] -; GFX9-NEXT: s_cbranch_execz .LBB4_3 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX9-NEXT: ; %bb.1: ; %.demote ; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; GFX9-NEXT: s_cbranch_scc0 .LBB4_4 ; GFX9-NEXT: ; %bb.2: ; %.demote ; GFX9-NEXT: s_wqm_b64 
s[16:17], s[12:13] ; GFX9-NEXT: s_and_b64 exec, exec, s[16:17] -; GFX9-NEXT: .LBB4_3: ; %.continue ; GFX9-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-NEXT: .LBB4_3: ; %.continue ; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf @@ -479,19 +489,20 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-32-NEXT: s_mov_b32 s12, exec_lo ; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_mov_b32 s13, exec_lo ; GFX10-32-NEXT: s_waitcnt vmcnt(0) ; GFX10-32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v0 -; GFX10-32-NEXT: s_and_saveexec_b32 s13, vcc_lo -; GFX10-32-NEXT: s_xor_b32 s13, exec_lo, s13 -; GFX10-32-NEXT: s_cbranch_execz .LBB4_3 +; GFX10-32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-32-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote ; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 .LBB4_4 ; GFX10-32-NEXT: ; %bb.2: ; %.demote ; GFX10-32-NEXT: s_wqm_b32 s14, s12 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s14 -; GFX10-32-NEXT: .LBB4_3: ; %.continue ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-32-NEXT: .LBB4_3: ; %.continue ; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D @@ -508,19 +519,20 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-64-NEXT: s_mov_b64 s[12:13], exec ; GFX10-64-NEXT: s_wqm_b64 exec, exec ; GFX10-64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_mov_b64 s[14:15], exec ; GFX10-64-NEXT: s_waitcnt vmcnt(0) ; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 -; GFX10-64-NEXT: s_and_saveexec_b64 s[14:15], vcc -; GFX10-64-NEXT: 
s_xor_b64 s[14:15], exec, s[14:15] -; GFX10-64-NEXT: s_cbranch_execz .LBB4_3 +; GFX10-64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX10-64-NEXT: s_cmov_b64 exec, vcc +; GFX10-64-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote ; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; GFX10-64-NEXT: s_cbranch_scc0 .LBB4_4 ; GFX10-64-NEXT: ; %bb.2: ; %.demote ; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] -; GFX10-64-NEXT: .LBB4_3: ; %.continue ; GFX10-64-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX10-64-NEXT: .LBB4_3: ; %.continue ; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D @@ -665,39 +677,40 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; SI-NEXT: s_wqm_b64 exec, exec ; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc -; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; SI-NEXT: s_cbranch_execz .LBB6_3 +; SI-NEXT: s_xor_b64 s[2:3], vcc, exec +; SI-NEXT: s_cmp_lg_u64 vcc, 0 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB6_3 ; SI-NEXT: ; %bb.1: ; %.demote0 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; SI-NEXT: s_cbranch_scc0 .LBB6_7 ; SI-NEXT: ; %bb.2: ; %.demote0 ; SI-NEXT: s_wqm_b64 s[4:5], s[0:1] ; SI-NEXT: s_and_b64 exec, exec, s[4:5] -; SI-NEXT: .LBB6_3: ; %.continue0 ; SI-NEXT: s_or_b64 exec, exec, s[2:3] -; SI-NEXT: s_mov_b64 s[2:3], s[0:1] -; SI-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] +; SI-NEXT: .LBB6_3: ; %.continue0 +; SI-NEXT: s_mov_b64 s[4:5], s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[4:5] ; SI-NEXT: v_mov_b32_e32 v1, v0 -; SI-NEXT: s_xor_b64 s[2:3], s[0:1], -1 -; SI-NEXT: s_nop 0 +; SI-NEXT: s_mov_b64 s[2:3], exec +; SI-NEXT: s_xor_b64 s[4:5], s[0:1], -1 ; SI-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; 
SI-NEXT: s_nop 1 ; SI-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; SI-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; SI-NEXT: s_and_b64 exec, exec, s[0:1] ; SI-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 -; SI-NEXT: s_or_b64 s[2:3], s[2:3], vcc -; SI-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] -; SI-NEXT: s_xor_b64 s[2:3], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB6_6 +; SI-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; SI-NEXT: s_cmov_b64 exec, s[4:5] +; SI-NEXT: s_cbranch_scc0 .LBB6_6 ; SI-NEXT: ; %bb.4: ; %.demote1 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; SI-NEXT: s_cbranch_scc0 .LBB6_7 ; SI-NEXT: ; %bb.5: ; %.demote1 ; SI-NEXT: s_mov_b64 exec, 0 -; SI-NEXT: .LBB6_6: ; %.continue1 ; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: .LBB6_6: ; %.continue1 ; SI-NEXT: v_bfrev_b32_e32 v0, 60 ; SI-NEXT: v_mov_b32_e32 v1, 0x3c00 ; SI-NEXT: exp mrt0 v1, v1, v0, v0 done compr vm @@ -713,39 +726,40 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9-NEXT: s_cbranch_execz .LBB6_3 +; GFX9-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX9-NEXT: ; %bb.1: ; %.demote0 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cbranch_scc0 .LBB6_7 ; GFX9-NEXT: ; %bb.2: ; %.demote0 ; GFX9-NEXT: s_wqm_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_and_b64 exec, exec, s[4:5] -; GFX9-NEXT: .LBB6_3: ; %.continue0 ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_mov_b64 s[2:3], s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] +; GFX9-NEXT: .LBB6_3: ; %.continue0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[4:5] ; 
GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_xor_b64 s[2:3], s[0:1], -1 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_xor_b64 s[4:5], s[0:1], -1 ; GFX9-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX9-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 -; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], vcc -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_6 +; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-NEXT: s_cbranch_scc0 .LBB6_6 ; GFX9-NEXT: ; %bb.4: ; %.demote1 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cbranch_scc0 .LBB6_7 ; GFX9-NEXT: ; %bb.5: ; %.demote1 ; GFX9-NEXT: s_mov_b64 exec, 0 -; GFX9-NEXT: .LBB6_6: ; %.continue1 ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB6_6: ; %.continue1 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 60 ; GFX9-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm @@ -761,37 +775,39 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX10-32-NEXT: s_cbranch_execz .LBB6_3 +; GFX10-32-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX10-32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-32-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote0 ; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 .LBB6_7 ; GFX10-32-NEXT: ; %bb.2: ; %.demote0 
; GFX10-32-NEXT: s_wqm_b32 s2, s0 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s2 -; GFX10-32-NEXT: .LBB6_3: ; %.continue0 ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10-32-NEXT: s_mov_b32 s1, s0 -; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s1 +; GFX10-32-NEXT: .LBB6_3: ; %.continue0 +; GFX10-32-NEXT: s_mov_b32 s2, s0 +; GFX10-32-NEXT: s_mov_b32 s1, exec_lo +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s2 ; GFX10-32-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-32-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-32-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 ; GFX10-32-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0, v0 -; GFX10-32-NEXT: s_xor_b32 s1, s0, -1 -; GFX10-32-NEXT: s_or_b32 s1, s1, vcc_lo -; GFX10-32-NEXT: s_and_saveexec_b32 s2, s1 -; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s2 -; GFX10-32-NEXT: s_cbranch_execz .LBB6_6 +; GFX10-32-NEXT: s_xor_b32 s2, s0, -1 +; GFX10-32-NEXT: s_or_b32 s2, s2, vcc_lo +; GFX10-32-NEXT: s_and_b32 s2, s2, exec_lo +; GFX10-32-NEXT: s_cmov_b32 exec_lo, s2 +; GFX10-32-NEXT: s_cbranch_scc0 .LBB6_6 ; GFX10-32-NEXT: ; %bb.4: ; %.demote1 ; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 .LBB6_7 ; GFX10-32-NEXT: ; %bb.5: ; %.demote1 ; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 -; GFX10-32-NEXT: .LBB6_6: ; %.continue1 ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-32-NEXT: .LBB6_6: ; %.continue1 ; GFX10-32-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX10-32-NEXT: v_bfrev_b32_e32 v1, 60 ; GFX10-32-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm @@ -807,37 +823,39 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; GFX10-64-NEXT: s_wqm_b64 exec, exec ; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX10-64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10-64-NEXT: 
s_xor_b64 s[2:3], exec, s[2:3] -; GFX10-64-NEXT: s_cbranch_execz .LBB6_3 +; GFX10-64-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX10-64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX10-64-NEXT: s_cmov_b64 exec, vcc +; GFX10-64-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote0 ; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX10-64-NEXT: s_cbranch_scc0 .LBB6_7 ; GFX10-64-NEXT: ; %bb.2: ; %.demote0 ; GFX10-64-NEXT: s_wqm_b64 s[4:5], s[0:1] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[4:5] -; GFX10-64-NEXT: .LBB6_3: ; %.continue0 ; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX10-64-NEXT: s_mov_b64 s[2:3], s[0:1] -; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] +; GFX10-64-NEXT: .LBB6_3: ; %.continue0 +; GFX10-64-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX10-64-NEXT: s_mov_b64 s[2:3], exec +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[4:5] ; GFX10-64-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-64-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-64-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX10-64-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 -; GFX10-64-NEXT: s_xor_b64 s[2:3], s[0:1], -1 -; GFX10-64-NEXT: s_or_b64 s[2:3], s[2:3], vcc -; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] -; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[4:5] -; GFX10-64-NEXT: s_cbranch_execz .LBB6_6 +; GFX10-64-NEXT: s_xor_b64 s[4:5], s[0:1], -1 +; GFX10-64-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GFX10-64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10-64-NEXT: s_cmov_b64 exec, s[4:5] +; GFX10-64-NEXT: s_cbranch_scc0 .LBB6_6 ; GFX10-64-NEXT: ; %bb.4: ; %.demote1 ; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX10-64-NEXT: s_cbranch_scc0 .LBB6_7 ; GFX10-64-NEXT: ; %bb.5: ; %.demote1 ; GFX10-64-NEXT: s_mov_b64 exec, 0 -; GFX10-64-NEXT: .LBB6_6: ; %.continue1 ; GFX10-64-NEXT: s_or_b64 exec, exec, 
s[2:3] +; GFX10-64-NEXT: .LBB6_6: ; %.continue1 ; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX10-64-NEXT: v_bfrev_b32_e32 v1, 60 ; GFX10-64-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm @@ -889,44 +907,45 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc -; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; SI-NEXT: s_cbranch_execz .LBB7_3 +; SI-NEXT: s_xor_b64 s[2:3], vcc, exec +; SI-NEXT: s_cmp_lg_u64 vcc, 0 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB7_3 ; SI-NEXT: ; %bb.1: ; %.demote0 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; SI-NEXT: s_cbranch_scc0 .LBB7_9 ; SI-NEXT: ; %bb.2: ; %.demote0 ; SI-NEXT: s_wqm_b64 s[4:5], s[0:1] ; SI-NEXT: s_and_b64 exec, exec, s[4:5] -; SI-NEXT: .LBB7_3: ; %.continue0.preheader ; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: .LBB7_3: ; %.continue0.preheader ; SI-NEXT: s_mov_b64 s[2:3], 0 ; SI-NEXT: s_branch .LBB7_5 ; SI-NEXT: .LBB7_4: ; %.continue1 ; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_add_i32 s6, s6, 1 ; SI-NEXT: v_cmp_ge_i32_e32 vcc, s6, v1 ; SI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; SI-NEXT: s_andn2_b64 exec, exec, s[2:3] -; SI-NEXT: s_cbranch_execz .LBB7_8 +; SI-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; SI-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; SI-NEXT: s_cbranch_scc0 .LBB7_8 ; SI-NEXT: .LBB7_5: ; %.continue0 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: s_mov_b64 s[4:5], s[0:1] -; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5] +; SI-NEXT: s_mov_b64 s[8:9], s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[8:9] ; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: s_xor_b64 s[4:5], s[0:1], -1 -; SI-NEXT: s_nop 0 +; SI-NEXT: s_xor_b64 s[8:9], s[0:1], -1 +; SI-NEXT: s_mov_b64 s[4:5], exec ; SI-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] 
row_mask:0xf bank_mask:0xf bound_ctrl:1 ; SI-NEXT: s_nop 1 ; SI-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; SI-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; SI-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 -; SI-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; SI-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] -; SI-NEXT: s_xor_b64 s[4:5], exec, s[8:9] -; SI-NEXT: s_cbranch_execz .LBB7_4 +; SI-NEXT: s_or_b64 s[8:9], s[8:9], vcc +; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec +; SI-NEXT: s_cmov_b64 exec, s[8:9] +; SI-NEXT: s_cbranch_scc0 .LBB7_4 ; SI-NEXT: ; %bb.6: ; %.demote1 ; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec @@ -935,9 +954,9 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 ; SI-NEXT: s_wqm_b64 s[8:9], s[0:1] ; SI-NEXT: s_and_b64 exec, exec, s[8:9] +; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_branch .LBB7_4 ; SI-NEXT: .LBB7_8: ; %.return -; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: s_and_b64 exec, exec, s[0:1] ; SI-NEXT: v_bfrev_b32_e32 v0, 60 ; SI-NEXT: v_mov_b32_e32 v1, 0x3c00 @@ -955,44 +974,45 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX9-NEXT: s_mov_b32 s6, 0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9-NEXT: s_cbranch_execz .LBB7_3 +; GFX9-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX9-NEXT: ; %bb.1: ; %.demote0 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cbranch_scc0 .LBB7_9 ; GFX9-NEXT: ; %bb.2: ; %.demote0 ; GFX9-NEXT: s_wqm_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_and_b64 exec, exec, s[4:5] -; GFX9-NEXT: .LBB7_3: ; %.continue0.preheader ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB7_3: ; 
%.continue0.preheader ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_branch .LBB7_5 ; GFX9-NEXT: .LBB7_4: ; %.continue1 ; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_add_i32 s6, s6, 1 ; GFX9-NEXT: v_cmp_ge_i32_e32 vcc, s6, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execz .LBB7_8 +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc0 .LBB7_8 ; GFX9-NEXT: .LBB7_5: ; %.continue0 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5] +; GFX9-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[8:9] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_xor_b64 s[4:5], s[0:1], -1 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], -1 +; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 -; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[8:9] -; GFX9-NEXT: s_cbranch_execz .LBB7_4 +; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], vcc +; GFX9-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX9-NEXT: s_cmov_b64 exec, s[8:9] +; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX9-NEXT: ; %bb.6: ; %.demote1 ; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec @@ -1001,9 +1021,9 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 ; GFX9-NEXT: s_wqm_b64 s[8:9], s[0:1] ; GFX9-NEXT: 
s_and_b64 exec, exec, s[8:9] +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_branch .LBB7_4 ; GFX9-NEXT: .LBB7_8: ; %.return -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 60 @@ -1021,41 +1041,43 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX10-32-NEXT: s_mov_b32 s1, 0 ; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10-32-NEXT: s_xor_b32 s2, exec_lo, s2 -; GFX10-32-NEXT: s_cbranch_execz .LBB7_3 +; GFX10-32-NEXT: s_xor_b32 s2, vcc_lo, exec_lo +; GFX10-32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-32-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote0 ; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 .LBB7_9 ; GFX10-32-NEXT: ; %bb.2: ; %.demote0 ; GFX10-32-NEXT: s_wqm_b32 s3, s0 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3 -; GFX10-32-NEXT: .LBB7_3: ; %.continue0.preheader ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-32-NEXT: .LBB7_3: ; %.continue0.preheader ; GFX10-32-NEXT: s_mov_b32 s2, 0 ; GFX10-32-NEXT: s_branch .LBB7_5 ; GFX10-32-NEXT: .LBB7_4: ; %.continue1 ; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10-32-NEXT: s_add_i32 s2, s2, 1 ; GFX10-32-NEXT: v_cmp_ge_i32_e32 vcc_lo, s2, v1 ; GFX10-32-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX10-32-NEXT: s_andn2_b32 exec_lo, exec_lo, s1 -; GFX10-32-NEXT: s_cbranch_execz .LBB7_8 +; GFX10-32-NEXT: s_andn2_b32 s3, exec_lo, s1 +; GFX10-32-NEXT: s_cselect_b32 exec_lo, s3, s1 +; GFX10-32-NEXT: s_cbranch_scc0 .LBB7_8 ; GFX10-32-NEXT: .LBB7_5: ; %.continue0 ; GFX10-32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-32-NEXT: s_mov_b32 s3, s0 -; GFX10-32-NEXT: v_cndmask_b32_e64 v0, s2, 0, s3 -; GFX10-32-NEXT: s_xor_b32 s3, s0, -1 +; 
GFX10-32-NEXT: s_mov_b32 s4, s0 +; GFX10-32-NEXT: s_mov_b32 s3, exec_lo +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, s2, 0, s4 +; GFX10-32-NEXT: s_xor_b32 s4, s0, -1 ; GFX10-32-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-32-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-32-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX10-32-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0, v0 -; GFX10-32-NEXT: s_or_b32 s3, s3, vcc_lo -; GFX10-32-NEXT: s_and_saveexec_b32 s4, s3 -; GFX10-32-NEXT: s_xor_b32 s3, exec_lo, s4 -; GFX10-32-NEXT: s_cbranch_execz .LBB7_4 +; GFX10-32-NEXT: s_or_b32 s4, s4, vcc_lo +; GFX10-32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10-32-NEXT: s_cmov_b32 exec_lo, s4 +; GFX10-32-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX10-32-NEXT: ; %bb.6: ; %.demote1 ; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 ; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo @@ -1064,9 +1086,9 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 ; GFX10-32-NEXT: s_wqm_b32 s4, s0 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s4 +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10-32-NEXT: s_branch .LBB7_4 ; GFX10-32-NEXT: .LBB7_8: ; %.return -; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 ; GFX10-32-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX10-32-NEXT: v_bfrev_b32_e32 v1, 60 @@ -1084,41 +1106,43 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX10-64-NEXT: s_mov_b32 s6, 0 ; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX10-64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX10-64-NEXT: s_cbranch_execz .LBB7_3 +; GFX10-64-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX10-64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX10-64-NEXT: s_cmov_b64 
exec, vcc +; GFX10-64-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote0 ; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX10-64-NEXT: s_cbranch_scc0 .LBB7_9 ; GFX10-64-NEXT: ; %bb.2: ; %.demote0 ; GFX10-64-NEXT: s_wqm_b64 s[4:5], s[0:1] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[4:5] -; GFX10-64-NEXT: .LBB7_3: ; %.continue0.preheader ; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10-64-NEXT: .LBB7_3: ; %.continue0.preheader ; GFX10-64-NEXT: s_mov_b64 s[2:3], 0 ; GFX10-64-NEXT: s_branch .LBB7_5 ; GFX10-64-NEXT: .LBB7_4: ; %.continue1 ; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX10-64-NEXT: s_add_i32 s6, s6, 1 ; GFX10-64-NEXT: v_cmp_ge_i32_e32 vcc, s6, v1 ; GFX10-64-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX10-64-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX10-64-NEXT: s_cbranch_execz .LBB7_8 +; GFX10-64-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX10-64-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX10-64-NEXT: s_cbranch_scc0 .LBB7_8 ; GFX10-64-NEXT: .LBB7_5: ; %.continue0 ; GFX10-64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-64-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX10-64-NEXT: v_cndmask_b32_e64 v0, s6, 0, s[4:5] -; GFX10-64-NEXT: s_xor_b64 s[4:5], s[0:1], -1 +; GFX10-64-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX10-64-NEXT: s_mov_b64 s[4:5], exec +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, s6, 0, s[8:9] +; GFX10-64-NEXT: s_xor_b64 s[8:9], s[0:1], -1 ; GFX10-64-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-64-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-64-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX10-64-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 -; GFX10-64-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX10-64-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] -; GFX10-64-NEXT: s_xor_b64 s[4:5], exec, s[8:9] -; GFX10-64-NEXT: s_cbranch_execz .LBB7_4 +; 
GFX10-64-NEXT: s_or_b64 s[8:9], s[8:9], vcc +; GFX10-64-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX10-64-NEXT: s_cmov_b64 exec, s[8:9] +; GFX10-64-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX10-64-NEXT: ; %bb.6: ; %.demote1 ; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 ; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec @@ -1127,9 +1151,9 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 ; GFX10-64-NEXT: s_wqm_b64 s[8:9], s[0:1] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[8:9] +; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX10-64-NEXT: s_branch .LBB7_4 ; GFX10-64-NEXT: .LBB7_8: ; %.return -; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX10-64-NEXT: v_bfrev_b32_e32 v1, 60 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index 0894d3251423d6..e6a06473cb74fd 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -97,10 +97,10 @@ define float @local_atomic_fadd_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB0_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -119,10 +119,10 @@ define float @local_atomic_fadd_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB0_1 +; 
GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(3) %ptr, float 4.0 seq_cst @@ -213,10 +213,10 @@ define float @local_atomic_fadd_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB1_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -236,10 +236,10 @@ define float @local_atomic_fadd_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB1_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fadd ptr addrspace(3) %gep, float 4.0 seq_cst @@ -329,11 +329,11 @@ define void @local_atomic_fadd_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB2_1 +; 
GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fadd_noret_f32: @@ -350,11 +350,11 @@ define void @local_atomic_fadd_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB2_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(3) %ptr, float 4.0 seq_cst ret void @@ -443,11 +443,11 @@ define void @local_atomic_fadd_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB3_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fadd_noret_f32__offset: @@ -465,11 +465,11 @@ define void @local_atomic_fadd_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB3_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: 
s_cbranch_scc1 .LBB3_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(3) %ptr, i32 16383 %unused = atomicrmw fadd ptr addrspace(3) %gep, float 4.0 seq_cst @@ -504,11 +504,11 @@ define double @local_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB4_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_ret_f64: @@ -537,11 +537,11 @@ define double @local_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB4_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fadd_ret_f64: @@ -562,10 +562,10 @@ define double @local_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: 
s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB4_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fadd_ret_f64: @@ -593,10 +593,10 @@ define double @local_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB4_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fadd_ret_f64: @@ -616,10 +616,10 @@ define double @local_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB4_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fadd_ret_f64: @@ -639,10 +639,10 @@ define double @local_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB4_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: 
s_cbranch_scc1 .LBB4_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fadd_ret_f64: @@ -662,10 +662,10 @@ define double @local_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB4_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst ret double %result @@ -695,11 +695,11 @@ define double @local_atomic_fadd_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB5_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_ret_f64__offset: @@ -728,11 +728,11 @@ define double @local_atomic_fadd_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB5_1 +; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fadd_ret_f64__offset: @@ -753,10 +753,10 @@ define double @local_atomic_fadd_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB5_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fadd_ret_f64__offset: @@ -784,10 +784,10 @@ define double @local_atomic_fadd_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB5_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fadd_ret_f64__offset: @@ -807,10 +807,10 @@ define double @local_atomic_fadd_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB5_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 
exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fadd_ret_f64__offset: @@ -830,10 +830,10 @@ define double @local_atomic_fadd_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB5_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fadd_ret_f64__offset: @@ -853,10 +853,10 @@ define double @local_atomic_fadd_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB5_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr addrspace(3) %ptr, i32 8191 %result = atomicrmw fadd ptr addrspace(3) %gep, double 4.0 seq_cst @@ -885,11 +885,11 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] ; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB6_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_noret_f64: @@ -916,11 +916,11 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] ; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB6_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fadd_noret_f64: @@ -940,10 +940,10 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_mov_b32_e32 v1, v3 ; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB6_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fadd_noret_f64: @@ -969,11 +969,11 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] ; GFX908-NEXT: v_mov_b32_e32 v1, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v2, v4 -; 
GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB6_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fadd_noret_f64: @@ -991,11 +991,11 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] ; GFX8-NEXT: v_mov_b32_e32 v1, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB6_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fadd_noret_f64: @@ -1013,11 +1013,11 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v2, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB6_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fadd_noret_f64: @@ -1035,11 +1035,11 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] ; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v2, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz 
.LBB6_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst ret void @@ -1067,11 +1067,11 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] ; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB7_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_noret_f64__offset: @@ -1098,11 +1098,11 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] ; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB7_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fadd_noret_f64__offset: @@ -1122,10 +1122,10 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX10-NEXT: 
v_mov_b32_e32 v1, v3 ; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB7_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fadd_noret_f64__offset: @@ -1151,11 +1151,11 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] ; GFX908-NEXT: v_mov_b32_e32 v1, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v2, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB7_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fadd_noret_f64__offset: @@ -1173,11 +1173,11 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] ; GFX8-NEXT: v_mov_b32_e32 v1, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB7_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fadd_noret_f64__offset: @@ -1195,11 +1195,11 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; 
GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v2, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB7_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fadd_noret_f64__offset: @@ -1218,11 +1218,11 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[0:1] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB7_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr addrspace(3) %ptr, i32 8191 %unused = atomicrmw fadd ptr addrspace(3) %gep, double 4.0 seq_cst @@ -1244,66 +1244,66 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: ds_load_b32 v2, v1 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-NEXT: ds_load_b32 v3, v1 +; GFX12-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX12-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_not_b32_e32 v3, v3 +; GFX12-NEXT: v_not_b32_e32 v2, v2 ; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v2 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX12-NEXT: v_lshrrev_b32_e32 v2, v0, v4 -; GFX12-NEXT: v_add_f16_e32 v2, 4.0, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-NEXT: v_add_f16_e32 v3, 4.0, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v0, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v4, v3, v2 +; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v1, v2, v4 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB8_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_ret_f16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX940-NEXT: ds_read_b32 v2, v1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX940-NEXT: ds_read_b32 v3, v1 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_and_b32_e32 v0, 24, v3 -; GFX940-NEXT: v_lshlrev_b32_e64 v3, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v3, v3 +; GFX940-NEXT: 
v_and_b32_e32 v0, 24, v2 +; GFX940-NEXT: v_lshlrev_b32_e64 v2, v2, s0 +; GFX940-NEXT: v_not_b32_e32 v2, v2 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, v2 -; GFX940-NEXT: v_lshrrev_b32_e32 v2, v0, v4 -; GFX940-NEXT: v_add_f16_e32 v2, 4.0, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, v0, v2 -; GFX940-NEXT: v_and_or_b32 v2, v4, v3, v2 -; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX940-NEXT: v_add_f16_e32 v3, 4.0, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX940-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB8_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v0, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fadd_ret_f16: @@ -1312,35 +1312,35 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: ds_load_b32 v2, v1 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-NEXT: ds_load_b32 v3, v1 +; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX11-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_not_b32_e32 v3, v3 +; GFX11-NEXT: v_not_b32_e32 v2, v2 ; GFX11-NEXT: .LBB8_1: ; 
%atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v0, v4 -; GFX11-NEXT: v_add_f16_e32 v2, 4.0, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-NEXT: v_add_f16_e32 v3, 4.0, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v0, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v2, v4, v3, v2 +; GFX11-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v1, v2, v4 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fadd_ret_f16: @@ -1367,10 +1367,10 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: 
s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB8_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1378,60 +1378,60 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX90A-NEXT: ds_read_b32 v2, v1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX90A-NEXT: ds_read_b32 v3, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_and_b32_e32 v0, 24, v3 -; GFX90A-NEXT: v_lshlrev_b32_e64 v3, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, 24, v2 +; GFX90A-NEXT: v_lshlrev_b32_e64 v2, v2, s4 +; GFX90A-NEXT: v_not_b32_e32 v2, v2 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v0, v4 -; GFX90A-NEXT: v_add_f16_e32 v2, 4.0, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v0, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v4, v3, v2 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX90A-NEXT: v_add_f16_e32 v3, 4.0, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], 
exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fadd_ret_f16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX908-NEXT: ds_read_b32 v2, v1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX908-NEXT: ds_read_b32 v3, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_and_b32_e32 v0, 24, v3 -; GFX908-NEXT: v_lshlrev_b32_e64 v3, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v3, v3 +; GFX908-NEXT: v_and_b32_e32 v0, 24, v2 +; GFX908-NEXT: v_lshlrev_b32_e64 v2, v2, s4 +; GFX908-NEXT: v_not_b32_e32 v2, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_lshrrev_b32_e32 v2, v0, v4 -; GFX908-NEXT: v_add_f16_e32 v2, 4.0, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, v0, v2 -; GFX908-NEXT: v_and_or_b32 v2, v4, v3, v2 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX908-NEXT: v_add_f16_e32 v3, 4.0, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB8_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB8_1 ; 
GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v0, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fadd_ret_f16: @@ -1459,10 +1459,10 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB8_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1492,10 +1492,10 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -1526,10 +1526,10 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB8_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: 
v_lshrrev_b32_e32 v0, v0, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -1574,43 +1574,43 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB9_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_ret_f16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v0, 0xfffe, v0 -; GFX940-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX940-NEXT: ds_read_b32 v2, v1 -; GFX940-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX940-NEXT: v_add_u32_e32 v1, 0xfffe, v0 +; GFX940-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX940-NEXT: ds_read_b32 v3, v0 +; GFX940-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v3, v0, s0 -; GFX940-NEXT: v_not_b32_e32 v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e64 v2, v1, s0 +; GFX940-NEXT: v_not_b32_e32 v2, v2 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, v2 -; GFX940-NEXT: v_lshrrev_b32_e32 v2, v0, v4 -; GFX940-NEXT: v_add_f16_e32 v2, 4.0, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, v0, v2 -; GFX940-NEXT: v_and_or_b32 v2, v4, v3, v2 -; GFX940-NEXT: ds_cmpst_rtn_b32 
v2, v1, v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX940-NEXT: v_add_f16_e32 v3, 4.0, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX940-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB9_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v0, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fadd_ret_f16__offset: @@ -1644,11 +1644,11 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -1659,124 +1659,124 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 3, v1 -; GFX10-NEXT: ds_read_b32 v2, v0 +; GFX10-NEXT: ds_read_b32 v3, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, 
v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, 0xffff -; GFX10-NEXT: v_not_b32_e32 v3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX10-NEXT: v_not_b32_e32 v2, v2 ; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v1, v4 -; GFX10-NEXT: v_add_f16_e32 v2, 4.0, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v2, v4, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX10-NEXT: v_add_f16_e32 v3, 4.0, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB9_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fadd_ret_f16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v0, 0xfffe, v0 -; GFX90A-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX90A-NEXT: ds_read_b32 v2, v1 -; GFX90A-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX90A-NEXT: v_add_u32_e32 v1, 0xfffe, v0 +; GFX90A-NEXT: 
v_and_b32_e32 v0, -4, v1 +; GFX90A-NEXT: ds_read_b32 v3, v0 +; GFX90A-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v3, v0, s4 -; GFX90A-NEXT: v_not_b32_e32 v3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e64 v2, v1, s4 +; GFX90A-NEXT: v_not_b32_e32 v2, v2 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v0, v4 -; GFX90A-NEXT: v_add_f16_e32 v2, 4.0, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v0, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v4, v3, v2 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX90A-NEXT: v_add_f16_e32 v3, 4.0, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fadd_ret_f16__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v0, 0xfffe, v0 -; GFX908-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX908-NEXT: ds_read_b32 v2, v1 -; GFX908-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX908-NEXT: 
v_add_u32_e32 v1, 0xfffe, v0 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX908-NEXT: ds_read_b32 v3, v0 +; GFX908-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v3, v0, s4 -; GFX908-NEXT: v_not_b32_e32 v3, v3 +; GFX908-NEXT: v_lshlrev_b32_e64 v2, v1, s4 +; GFX908-NEXT: v_not_b32_e32 v2, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_lshrrev_b32_e32 v2, v0, v4 -; GFX908-NEXT: v_add_f16_e32 v2, 4.0, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, v0, v2 -; GFX908-NEXT: v_and_or_b32 v2, v4, v3, v2 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX908-NEXT: v_add_f16_e32 v3, 4.0, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB9_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v0, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fadd_ret_f16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffe, v0 -; GFX8-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xfffe, v0 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v1 ; GFX8-NEXT: 
s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v3, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX8-NEXT: ds_read_b32 v3, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4 +; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s4 ; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v1, v4 ; GFX8-NEXT: v_add_f16_e32 v3, 4.0, v3 ; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v1, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB9_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fadd_ret_f16__offset: @@ -1806,10 +1806,10 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB9_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB9_1 ; 
GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -1841,10 +1841,10 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB9_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -1888,11 +1888,11 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB10_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_noret_f16: @@ -1917,11 +1917,11 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB10_1 +; GFX940-NEXT: 
s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fadd_noret_f16: @@ -1953,11 +1953,11 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX11-NEXT: v_mov_b32_e32 v2, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB10_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fadd_noret_f16: @@ -1984,10 +1984,10 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB10_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fadd_noret_f16: @@ -2012,11 +2012,11 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 +; 
GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fadd_noret_f16: @@ -2041,11 +2041,11 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB10_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fadd_noret_f16: @@ -2072,11 +2072,11 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB10_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fadd_noret_f16: @@ -2104,11 +2104,11 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB10_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: 
s_cbranch_scc1 .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fadd_noret_f16: @@ -2136,11 +2136,11 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB10_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(3) %ptr, half 4.0 seq_cst ret void @@ -2182,11 +2182,11 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB11_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_noret_f16__offset: @@ -2212,11 +2212,11 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz 
.LBB11_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fadd_noret_f16__offset: @@ -2249,11 +2249,11 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB11_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fadd_noret_f16__offset: @@ -2281,10 +2281,10 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB11_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fadd_noret_f16__offset: @@ -2310,11 +2310,11 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: s_andn2_b64 exec, 
exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fadd_noret_f16__offset: @@ -2340,11 +2340,11 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB11_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fadd_noret_f16__offset: @@ -2372,11 +2372,11 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB11_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fadd_noret_f16__offset: @@ -2405,11 +2405,11 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: 
s_cbranch_execnz .LBB11_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fadd_noret_f16__offset: @@ -2438,11 +2438,11 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB11_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(3) %ptr, i32 32767 %unused = atomicrmw fadd ptr addrspace(3) %gep, half 4.0 seq_cst @@ -2475,11 +2475,11 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB12_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2499,10 +2499,10 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: 
s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB12_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -2526,11 +2526,11 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB12_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2552,10 +2552,10 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB12_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2575,10 +2575,10 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: 
s_cbranch_execnz .LBB12_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2598,10 +2598,10 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB12_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v1 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -2622,10 +2622,10 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB12_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -2648,10 +2648,10 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB12_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX7-NEXT: ; %bb.2: 
; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -2675,10 +2675,10 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB12_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(3) %ptr, i32 32767 @@ -2711,11 +2711,11 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB13_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_noret_f16__offset__align4: @@ -2733,11 +2733,11 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB13_1 +; GFX940-NEXT: s_cselect_b64 exec, 
s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fadd_noret_f16__offset__align4: @@ -2759,11 +2759,11 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB13_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fadd_noret_f16__offset__align4: @@ -2784,10 +2784,10 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB13_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fadd_noret_f16__offset__align4: @@ -2805,11 +2805,11 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; 
GFX90A-NEXT: s_cbranch_execnz .LBB13_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fadd_noret_f16__offset__align4: @@ -2827,11 +2827,11 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB13_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fadd_noret_f16__offset__align4: @@ -2850,11 +2850,11 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB13_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fadd_noret_f16__offset__align4: @@ -2875,11 +2875,11 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; 
GFX7-NEXT: s_cbranch_execnz .LBB13_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fadd_noret_f16__offset__align4: @@ -2901,11 +2901,11 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB13_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(3) %ptr, i32 32767 %unused = atomicrmw fadd ptr addrspace(3) %gep, half 4.0 seq_cst, align 4 @@ -2959,11 +2959,11 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB14_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2998,10 +2998,10 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB14_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -3043,11 +3043,11 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3080,10 +3080,10 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB14_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -3116,10 +3116,10 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 
-; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3152,10 +3152,10 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB14_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -3190,10 +3190,10 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB14_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -3223,10 +3223,10 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB14_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 
.LBB14_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -3257,10 +3257,10 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB14_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -3313,24 +3313,24 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB15_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_ret_bf16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v0, 0xfffe, v0 -; GFX940-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX940-NEXT: ds_read_b32 v3, v1 -; GFX940-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX940-NEXT: v_add_u32_e32 v1, 0xfffe, v0 +; GFX940-NEXT: 
v_and_b32_e32 v0, -4, v1 +; GFX940-NEXT: ds_read_b32 v3, v0 +; GFX940-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v2, v0, s0 +; GFX940-NEXT: v_lshlrev_b32_e64 v2, v1, s0 ; GFX940-NEXT: v_not_b32_e32 v2, v2 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff @@ -3338,7 +3338,7 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v4, v3 -; GFX940-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_add_f32_e32 v3, 4.0, v3 ; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1 @@ -3347,17 +3347,17 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX940-NEXT: s_nop 1 ; GFX940-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX940-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB15_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v0, 
v3 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fadd_ret_bf16__offset: @@ -3400,11 +3400,11 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3438,23 +3438,23 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB15_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fadd_ret_bf16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v0, 0xfffe, v0 -; GFX90A-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX90A-NEXT: ds_read_b32 v3, v1 -; GFX90A-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX90A-NEXT: v_add_u32_e32 v1, 0xfffe, v0 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX90A-NEXT: ds_read_b32 v3, v0 +; GFX90A-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX90A-NEXT: 
v_lshlrev_b32_e32 v1, 3, v1 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v2, v0, s4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v2, v1, s4 ; GFX90A-NEXT: v_not_b32_e32 v2, v2 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff @@ -3462,36 +3462,36 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX90A-NEXT: v_add_f32_e32 v3, 4.0, v3 ; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fadd_ret_bf16__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v0, 0xfffe, v0 -; GFX908-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX908-NEXT: ds_read_b32 v3, v1 -; GFX908-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX908-NEXT: v_add_u32_e32 v1, 0xfffe, v0 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX908-NEXT: ds_read_b32 v3, v0 +; GFX908-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v2, v0, s4 +; GFX908-NEXT: v_lshlrev_b32_e64 v2, v1, s4 ; GFX908-NEXT: v_not_b32_e32 v2, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff @@ -3499,44 +3499,44 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX908-NEXT: v_add_f32_e32 v3, 4.0, v3 ; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB15_1 +; GFX908-NEXT: 
s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fadd_ret_bf16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffe, v0 -; GFX8-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xfffe, v0 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v1 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v3, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX8-NEXT: ds_read_b32 v3, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4 +; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s4 ; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_add_f32_e32 v3, 4.0, v3 ; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 @@ -3545,17 +3545,17 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc ; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB15_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fadd_ret_bf16__offset: @@ -3585,10 +3585,10 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB15_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -3620,10 +3620,10 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB15_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; 
GFX6-NEXT: s_setpc_b64 s[30:31] @@ -3675,11 +3675,11 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_noret_bf16: @@ -3712,11 +3712,11 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB16_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fadd_noret_bf16: @@ -3757,11 +3757,11 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX11-NEXT: v_mov_b32_e32 v2, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB16_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: 
s_cbranch_scc1 .LBB16_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fadd_noret_bf16: @@ -3793,10 +3793,10 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB16_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fadd_noret_bf16: @@ -3827,11 +3827,11 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fadd_noret_bf16: @@ -3862,11 +3862,11 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB16_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: 
s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fadd_noret_bf16: @@ -3899,11 +3899,11 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB16_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fadd_noret_bf16: @@ -3931,11 +3931,11 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB16_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fadd_noret_bf16: @@ -3963,11 +3963,11 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB16_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr 
addrspace(3) %ptr, bfloat 4.0 seq_cst ret void @@ -4017,11 +4017,11 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_noret_bf16__offset: @@ -4055,11 +4055,11 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB17_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fadd_noret_bf16__offset: @@ -4101,11 +4101,11 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: 
s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fadd_noret_bf16__offset: @@ -4138,10 +4138,10 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB17_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fadd_noret_bf16__offset: @@ -4173,11 +4173,11 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fadd_noret_bf16__offset: @@ -4209,11 +4209,11 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB17_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; 
GFX908-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fadd_noret_bf16__offset: @@ -4247,11 +4247,11 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB17_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fadd_noret_bf16__offset: @@ -4280,11 +4280,11 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fadd_noret_bf16__offset: @@ -4313,11 +4313,11 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB17_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX6-NEXT: ; %bb.2: 
; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(3) %ptr, i32 32767 %unused = atomicrmw fadd ptr addrspace(3) %gep, bfloat 4.0 seq_cst @@ -4358,11 +4358,11 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -4391,10 +4391,10 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB18_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -4427,11 +4427,11 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4459,10 +4459,10 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB18_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4490,10 +4490,10 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4521,10 +4521,10 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB18_1 ; 
GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v1 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -4552,10 +4552,10 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB18_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -4578,10 +4578,10 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB18_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -4605,10 +4605,10 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB18_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(3) 
%ptr, i32 32767 @@ -4649,11 +4649,11 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_noret_bf16__offset__align4: @@ -4680,11 +4680,11 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB19_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fadd_noret_bf16__offset__align4: @@ -4715,11 +4715,11 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 
exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fadd_noret_bf16__offset__align4: @@ -4746,10 +4746,10 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB19_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fadd_noret_bf16__offset__align4: @@ -4775,11 +4775,11 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB19_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fadd_noret_bf16__offset__align4: @@ -4805,11 +4805,11 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB19_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; 
GFX908-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fadd_noret_bf16__offset__align4: @@ -4835,11 +4835,11 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB19_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fadd_noret_bf16__offset__align4: @@ -4860,11 +4860,11 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB19_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fadd_noret_bf16__offset__align4: @@ -4886,11 +4886,11 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB19_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 
.LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(3) %ptr, i32 32767 %unused = atomicrmw fadd ptr addrspace(3) %gep, bfloat 4.0 seq_cst, align 4 @@ -4940,11 +4940,11 @@ define <2 x half> @local_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB20_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4964,10 +4964,10 @@ define <2 x half> @local_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB20_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4985,10 +4985,10 @@ define <2 x half> @local_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], 
s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5006,10 +5006,10 @@ define <2 x half> @local_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB20_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -5030,10 +5030,10 @@ define <2 x half> @local_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB20_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -5058,24 +5058,24 @@ define <2 x half> @local_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v1 +; 
GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB20_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -5100,24 +5100,24 @@ define <2 x half> @local_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_add_f32_e32 v5, v5, v3 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: 
s_cbranch_execnz .LBB20_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383 @@ -5164,11 +5164,11 @@ define <2 x half> @local_atomic_fadd_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB21_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -5188,10 +5188,10 @@ define <2 x half> @local_atomic_fadd_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB21_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -5209,10 +5209,10 @@ define <2 x half> @local_atomic_fadd_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: 
s_cbranch_execnz .LBB21_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5230,10 +5230,10 @@ define <2 x half> @local_atomic_fadd_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB21_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -5254,10 +5254,10 @@ define <2 x half> @local_atomic_fadd_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB21_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -5282,24 +5282,24 @@ define <2 x half> @local_atomic_fadd_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: 
v_lshlrev_b32_e32 v1, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v1 offset:65532 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB21_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -5325,24 +5325,24 @@ define <2 x half> @local_atomic_fadd_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v0, v6, v0 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v3, v7, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v1, v6, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v3, v0, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v0 +; 
GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB21_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fadd ptr addrspace(3) %gep, <2 x half> %val seq_cst @@ -5387,11 +5387,11 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX11-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB22_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fadd_noret_v2f16: @@ -5410,10 +5410,10 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB22_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fadd_noret_v2f16: @@ -5429,11 +5429,11 @@ define void 
@local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fadd_noret_v2f16: @@ -5449,11 +5449,11 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v2, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB22_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fadd_noret_v2f16: @@ -5472,11 +5472,11 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB22_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fadd_noret_v2f16: @@ -5500,24 +5500,24 @@ define void @local_atomic_fadd_noret_v2f16(ptr 
addrspace(3) %ptr, <2 x half> %va ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v4 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB22_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fadd_noret_v2f16: @@ -5541,24 +5541,24 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; 
GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v4 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB22_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(3) %ptr, <2 x half> %val seq_cst ret void @@ -5602,11 +5602,11 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX11-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB23_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fadd_noret_v2f16__offset: @@ -5625,10 +5625,10 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: 
s_cbranch_execnz .LBB23_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fadd_noret_v2f16__offset: @@ -5644,11 +5644,11 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fadd_noret_v2f16__offset: @@ -5664,11 +5664,11 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v2, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB23_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fadd_noret_v2f16__offset: @@ -5687,11 +5687,11 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v3 -; GFX8-NEXT: 
s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB23_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fadd_noret_v2f16__offset: @@ -5715,24 +5715,24 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 offset:65532 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v4 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB23_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fadd_noret_v2f16__offset: @@ -5757,24 +5757,24 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; 
GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v4 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB23_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fadd ptr addrspace(3) %gep, <2 x half> %val seq_cst @@ -5845,12 +5845,12 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB24_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: 
s_cbranch_scc1 .LBB24_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -5886,10 +5886,10 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB24_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -5925,10 +5925,10 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5964,10 +5964,10 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB24_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: 
v_mov_b32_e32 v0, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -6005,10 +6005,10 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB24_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -6041,13 +6041,13 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB24_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -6080,13 +6080,13 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB24_1 +; GFX6-NEXT: s_cselect_b64 exec, 
s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(3) %ptr, <2 x bfloat> %val seq_cst @@ -6153,12 +6153,12 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB25_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -6194,10 +6194,10 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB25_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -6233,10 +6233,10 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB25_1 +; GFX90A-NEXT: s_andn2_b64 
s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6272,10 +6272,10 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB25_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -6313,10 +6313,10 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB25_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -6349,13 +6349,13 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; 
GFX7-NEXT: s_cbranch_execnz .LBB25_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -6389,13 +6389,13 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v4, v0, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v0 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB25_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fadd ptr addrspace(3) %gep, <2 x bfloat> %val seq_cst @@ -6461,12 +6461,12 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fadd_noret_v2bf16: @@ -6501,10 +6501,10 @@ define void 
@local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB26_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fadd_noret_v2bf16: @@ -6538,11 +6538,11 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fadd_noret_v2bf16: @@ -6576,11 +6576,11 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB26_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fadd_noret_v2bf16: @@ -6616,11 +6616,11 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x 
bfloat> ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB26_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fadd_noret_v2bf16: @@ -6651,13 +6651,13 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB26_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fadd_noret_v2bf16: @@ -6688,13 +6688,13 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB26_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(3) %ptr, <2 x bfloat> %val seq_cst ret void @@ -6759,12 +6759,12 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fadd_noret_v2bf16__ofset: @@ -6799,10 +6799,10 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB27_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fadd_noret_v2bf16__ofset: @@ -6836,11 +6836,11 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 +; GFX90A-NEXT: 
s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fadd_noret_v2bf16__ofset: @@ -6874,11 +6874,11 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB27_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fadd_noret_v2bf16__ofset: @@ -6914,11 +6914,11 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB27_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fadd_noret_v2bf16__ofset: @@ -6949,13 +6949,13 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; 
GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB27_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fadd_noret_v2bf16__ofset: @@ -6987,13 +6987,13 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB27_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fadd ptr addrspace(3) %gep, <2 x bfloat> %val seq_cst @@ -7009,14 +7009,16 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x8 ; GFX12-NEXT: s_mov_b32 s6, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX12-NEXT: ; implicit-def: $vgpr1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_co_i32 s1, s5, 4 -; GFX12-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX12-NEXT: s_cbranch_execz .LBB28_2 +; GFX12-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; 
GFX12-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12-NEXT: s_cbranch_scc0 .LBB28_2 ; GFX12-NEXT: ; %bb.1: ; GFX12-NEXT: s_bcnt1_i32_b32 s5, s6 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -7028,15 +7030,18 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: ds_add_rtn_f32 v1, v2, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: .LBB28_2: ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: .LBB28_2: ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_mov_b32 s7, exec_lo ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0 ; GFX12-NEXT: s_mov_b32 s6, exec_lo -; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX12-NEXT: s_cbranch_execz .LBB28_4 +; GFX12-NEXT: v_cmp_eq_u32_e64 s0, 0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_cmp_lg_u32 s0, 0 +; GFX12-NEXT: s_cmov_b32 exec_lo, s0 +; GFX12-NEXT: s_cbranch_scc0 .LBB28_4 ; GFX12-NEXT: ; %bb.3: ; GFX12-NEXT: s_bcnt1_i32_b32 s0, s7 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -7048,8 +7053,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: ds_add_f32 v2, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: .LBB28_4: ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: .LBB28_4: ; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX12-NEXT: s_mov_b32 s1, exec_lo ; GFX12-NEXT: s_brev_b32 s0, 1 @@ -7073,20 +7078,21 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX12-NEXT: ; %bb.6: ; %ComputeEnd ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX12-NEXT: ; implicit-def: $vgpr1 -; GFX12-NEXT: 
s_and_saveexec_b32 s1, vcc_lo -; GFX12-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execz .LBB28_8 +; GFX12-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX12-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX12-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12-NEXT: s_cbranch_scc0 .LBB28_8 ; GFX12-NEXT: ; %bb.7: ; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: ds_add_rtn_f32 v1, v1, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: .LBB28_8: ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: .LBB28_8: ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7101,32 +7107,36 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX940-LABEL: local_ds_fadd: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX940-NEXT: s_mov_b64 s[6:7], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_add_i32 s5, s5, 4 +; GFX940-NEXT: s_mov_b64 s[0:1], exec +; GFX940-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX940-NEXT: ; implicit-def: $vgpr1 -; GFX940-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX940-NEXT: s_cbranch_execz .LBB28_2 +; GFX940-NEXT: s_cmov_b64 exec, vcc +; GFX940-NEXT: s_cbranch_scc0 .LBB28_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX940-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX940-NEXT: s_lshl_b32 s8, s5, 3 -; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 ; GFX940-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: ds_add_rtn_f32 v1, v2, v1 ; 
GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: .LBB28_2: -; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_mov_b64 s[8:9], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v2 +; GFX940-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 +; GFX940-NEXT: s_mov_b64 s[6:7], exec ; GFX940-NEXT: v_readfirstlane_b32 s10, v1 -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v1, s8, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v1, s9, v1 -; GFX940-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GFX940-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] -; GFX940-NEXT: s_cbranch_execz .LBB28_4 +; GFX940-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX940-NEXT: s_cmov_b64 exec, s[0:1] +; GFX940-NEXT: s_cbranch_scc0 .LBB28_4 ; GFX940-NEXT: ; %bb.3: ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[8:9] ; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 @@ -7135,8 +7145,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX940-NEXT: v_mov_b32_e32 v2, s0 ; GFX940-NEXT: ds_add_f32 v2, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: .LBB28_4: ; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX940-NEXT: .LBB28_4: ; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX940-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX940-NEXT: v_add_f32_e32 v0, s10, v0 @@ -7161,16 +7171,17 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX940-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX940-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX940-NEXT: ; implicit-def: $vgpr2 -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX940-NEXT: s_cbranch_execz .LBB28_8 +; GFX940-NEXT: s_cmov_b64 exec, vcc +; GFX940-NEXT: s_cbranch_scc0 .LBB28_8 ; GFX940-NEXT: ; %bb.7: ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX940-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX940-NEXT: .LBB28_8: ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: .LBB28_8: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX940-NEXT: v_readfirstlane_b32 s2, v2 ; GFX940-NEXT: v_mov_b32_e32 v1, 0 @@ -7186,14 +7197,16 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x8 ; GFX11-NEXT: s_mov_b32 s6, exec_lo -; GFX11-NEXT: ; implicit-def: $vgpr1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_mov_b32 s0, exec_lo ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_i32 s1, s5, 4 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX11-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB28_2 ; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: s_bcnt1_i32_b32 s5, s6 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -7204,15 +7217,18 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX11-NEXT: ds_add_rtn_f32 v1, v2, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: .LBB28_2: ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: .LBB28_2: ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: s_mov_b32 s7, exec_lo ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0 ; GFX11-NEXT: s_mov_b32 s6, exec_lo -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX11-NEXT: s_cbranch_execz .LBB28_4 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_cmov_b32 exec_lo, s0 +; GFX11-NEXT: s_cbranch_scc0 .LBB28_4 ; GFX11-NEXT: ; 
%bb.3: ; GFX11-NEXT: s_bcnt1_i32_b32 s0, s7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -7223,8 +7239,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX11-NEXT: ds_add_f32 v2, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: .LBB28_4: ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: .LBB28_4: ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX11-NEXT: s_mov_b32 s0, exec_lo @@ -7250,19 +7266,20 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX11-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX11-NEXT: ; %bb.6: ; %ComputeEnd ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX11-NEXT: ; implicit-def: $vgpr2 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB28_8 +; GFX11-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX11-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX11-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB28_8 ; GFX11-NEXT: ; %bb.7: ; GFX11-NEXT: v_mov_b32_e32 v2, s4 ; GFX11-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: .LBB28_8: ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: .LBB28_8: ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_readfirstlane_b32 s2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7278,13 +7295,15 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 ; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: ; implicit-def: $vgpr1 +; GFX10-NEXT: s_mov_b32 s0, exec_lo ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; 
GFX10-NEXT: ; implicit-def: $vgpr1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_i32 s1, s5, 4 -; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB28_2 +; GFX10-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB28_2 ; GFX10-NEXT: ; %bb.1: ; GFX10-NEXT: s_bcnt1_i32_b32 s5, s6 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s5 @@ -7294,15 +7313,17 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX10-NEXT: ds_add_rtn_f32 v1, v2, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: .LBB28_2: ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-NEXT: .LBB28_2: ; GFX10-NEXT: s_mov_b32 s7, exec_lo ; GFX10-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0 +; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2 -; GFX10-NEXT: s_and_saveexec_b32 s6, s0 -; GFX10-NEXT: s_cbranch_execz .LBB28_4 +; GFX10-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10-NEXT: s_cmov_b32 exec_lo, s0 +; GFX10-NEXT: s_cbranch_scc0 .LBB28_4 ; GFX10-NEXT: ; %bb.3: ; GFX10-NEXT: s_bcnt1_i32_b32 s0, s7 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 @@ -7313,9 +7334,9 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX10-NEXT: ds_add_f32 v2, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: .LBB28_4: ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: .LBB28_4: ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX10-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX10-NEXT: s_mov_b32 s0, exec_lo @@ -7338,18 +7359,19 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX10-NEXT: ; implicit-def: $vgpr2 -; GFX10-NEXT: 
s_and_saveexec_b32 s0, vcc_lo -; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX10-NEXT: s_cbranch_execz .LBB28_8 +; GFX10-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX10-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB28_8 ; GFX10-NEXT: ; %bb.7: ; GFX10-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: .LBB28_8: ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-NEXT: .LBB28_8: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: s_mov_b32 null, 0 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 @@ -7363,32 +7385,36 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX90A-LABEL: local_ds_fadd: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX90A-NEXT: s_mov_b64 s[6:7], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_add_i32 s5, s5, 4 +; GFX90A-NEXT: s_mov_b64 s[0:1], exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX90A-NEXT: ; implicit-def: $vgpr1 -; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB28_2 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB28_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX90A-NEXT: s_lshl_b32 s8, s5, 3 -; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 ; GFX90A-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: ds_add_rtn_f32 v1, v2, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: 
s_or_b64 exec, exec, s[0:1] ; GFX90A-NEXT: .LBB28_2: -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[8:9], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 +; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: v_readfirstlane_b32 s10, v1 -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v1, s8, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v1, s9, v1 -; GFX90A-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] -; GFX90A-NEXT: s_cbranch_execz .LBB28_4 +; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[0:1] +; GFX90A-NEXT: s_cbranch_scc0 .LBB28_4 ; GFX90A-NEXT: ; %bb.3: ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[8:9] ; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 @@ -7397,8 +7423,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX90A-NEXT: v_mov_b32_e32 v2, s0 ; GFX90A-NEXT: ds_add_f32 v2, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB28_4: ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB28_4: ; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX90A-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX90A-NEXT: v_add_f32_e32 v0, s10, v0 @@ -7423,16 +7449,17 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX90A-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX90A-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX90A-NEXT: s_cbranch_execz .LBB28_8 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB28_8 ; GFX90A-NEXT: ; %bb.7: ; GFX90A-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB28_8: ; 
GFX90A-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX90A-NEXT: .LBB28_8: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX90A-NEXT: v_readfirstlane_b32 s2, v2 ; GFX90A-NEXT: v_add_f32_e32 v0, s2, v0 @@ -7446,32 +7473,36 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX908-LABEL: local_ds_fadd: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 -; GFX908-NEXT: s_mov_b64 s[0:1], exec -; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX908-NEXT: s_mov_b64 s[6:7], exec +; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_add_i32 s5, s5, 4 +; GFX908-NEXT: s_mov_b64 s[0:1], exec +; GFX908-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX908-NEXT: ; implicit-def: $vgpr1 -; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX908-NEXT: s_cbranch_execz .LBB28_2 +; GFX908-NEXT: s_cmov_b64 exec, vcc +; GFX908-NEXT: s_cbranch_scc0 .LBB28_2 ; GFX908-NEXT: ; %bb.1: -; GFX908-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX908-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX908-NEXT: s_lshl_b32 s8, s5, 3 -; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 ; GFX908-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX908-NEXT: v_mov_b32_e32 v2, s8 ; GFX908-NEXT: ds_add_rtn_f32 v1, v2, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX908-NEXT: .LBB28_2: -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[8:9], exec +; GFX908-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0 +; GFX908-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v2 +; GFX908-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 +; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: v_readfirstlane_b32 s10, v1 -; GFX908-NEXT: v_mbcnt_lo_u32_b32 v1, s8, 0 -; GFX908-NEXT: v_mbcnt_hi_u32_b32 v1, s9, v1 -; GFX908-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GFX908-NEXT: s_and_saveexec_b64 
s[6:7], s[0:1] -; GFX908-NEXT: s_cbranch_execz .LBB28_4 +; GFX908-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX908-NEXT: s_cmov_b64 exec, s[0:1] +; GFX908-NEXT: s_cbranch_scc0 .LBB28_4 ; GFX908-NEXT: ; %bb.3: ; GFX908-NEXT: s_bcnt1_i32_b64 s0, s[8:9] ; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 @@ -7480,8 +7511,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX908-NEXT: v_mov_b32_e32 v2, s0 ; GFX908-NEXT: ds_add_f32 v2, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: .LBB28_4: ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: .LBB28_4: ; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX908-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX908-NEXT: v_add_f32_e32 v0, s10, v0 @@ -7506,16 +7537,17 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX908-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX908-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX908-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX908-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX908-NEXT: ; implicit-def: $vgpr2 -; GFX908-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX908-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX908-NEXT: s_cbranch_execz .LBB28_8 +; GFX908-NEXT: s_cmov_b64 exec, vcc +; GFX908-NEXT: s_cbranch_scc0 .LBB28_8 ; GFX908-NEXT: ; %bb.7: ; GFX908-NEXT: v_mov_b32_e32 v2, s4 ; GFX908-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: .LBB28_8: ; GFX908-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX908-NEXT: .LBB28_8: ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX908-NEXT: v_readfirstlane_b32 s2, v2 ; GFX908-NEXT: v_add_f32_e32 v0, s2, v0 @@ -7529,33 +7561,37 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX8-LABEL: local_ds_fadd: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX8-NEXT: 
s_mov_b64 s[6:7], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_i32 s5, s5, 4 +; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX8-NEXT: s_cbranch_execz .LBB28_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB28_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX8-NEXT: s_lshl_b32 s8, s5, 3 -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: ds_add_rtn_f32 v1, v2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8-NEXT: .LBB28_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[8:9], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v2 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 +; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: v_readfirstlane_b32 s10, v1 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, s8, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v1, s9, v1 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GFX8-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] -; GFX8-NEXT: s_cbranch_execz .LBB28_4 +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8-NEXT: s_cmov_b64 exec, s[0:1] +; GFX8-NEXT: s_cbranch_scc0 .LBB28_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[8:9] ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 @@ -7564,8 +7600,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: ds_add_f32 v2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB28_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: .LBB28_4: ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 
v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX8-NEXT: v_add_f32_e32 v0, s10, v0 @@ -7590,17 +7626,18 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr2 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8-NEXT: s_cbranch_execz .LBB28_8 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB28_8 ; GFX8-NEXT: ; %bb.7: ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB28_8: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: .LBB28_8: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX8-NEXT: v_readfirstlane_b32 s2, v2 ; GFX8-NEXT: v_add_f32_e32 v0, s2, v0 @@ -7621,10 +7658,12 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_i32 s5, s5, 4 +; GFX7-NEXT: s_mov_b64 s[6:7], exec +; GFX7-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX7-NEXT: ; implicit-def: $vgpr1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX7-NEXT: s_cbranch_execz .LBB28_4 +; GFX7-NEXT: s_cmov_b64 exec, vcc +; GFX7-NEXT: s_cbranch_scc0 .LBB28_4 ; GFX7-NEXT: ; %bb.1: ; GFX7-NEXT: s_lshl_b32 s8, s5, 3 ; GFX7-NEXT: v_mov_b32_e32 v2, s8 @@ -7642,19 +7681,21 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v4 ; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB28_2 +; GFX7-NEXT: s_andn2_b64 s[0:1], exec, s[8:9] +; GFX7-NEXT: 
s_cselect_b64 exec, s[0:1], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB28_2 ; GFX7-NEXT: ; %bb.3: ; %Flow23 -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: .LBB28_4: ; %Flow24 ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: .LBB28_4: ; GFX7-NEXT: s_mov_b64 s[8:9], exec +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v2, s8, 0 +; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s9, v2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 +; GFX7-NEXT: s_and_b64 s[10:11], s[0:1], -1 +; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: v_readfirstlane_b32 s10, v1 -; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v1, s8, 0 -; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v1, s9, v1 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] -; GFX7-NEXT: s_cbranch_execz .LBB28_7 +; GFX7-NEXT: s_cmov_b64 exec, s[0:1] +; GFX7-NEXT: s_cbranch_scc0 .LBB28_8 ; GFX7-NEXT: ; %bb.5: ; GFX7-NEXT: s_lshl_b32 s0, s5, 4 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 @@ -7671,11 +7712,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] +; GFX7-NEXT: s_andn2_b64 s[0:1], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB28_6 -; GFX7-NEXT: .LBB28_7: ; %Flow22 +; GFX7-NEXT: s_cselect_b64 exec, s[0:1], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB28_6 +; GFX7-NEXT: ; %bb.7: ; %Flow21 ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: .LBB28_8: ; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX7-NEXT: v_add_f32_e32 v0, s10, v0 @@ -7684,7 +7727,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX7-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX7-NEXT: ; implicit-def: $vgpr0 -; GFX7-NEXT: .LBB28_8: ; %ComputeLoop +; GFX7-NEXT: .LBB28_9: ; %ComputeLoop ; GFX7-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX7-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX7-NEXT: s_lshl_b64 s[6:7], 1, s5 @@ -7696,21 +7739,22 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: v_writelane_b32 v0, s8, m0 ; GFX7-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX7-NEXT: s_and_b64 vcc, exec, s[6:7] -; GFX7-NEXT: s_cbranch_vccnz .LBB28_8 -; GFX7-NEXT: ; %bb.9: ; %ComputeEnd +; GFX7-NEXT: s_cbranch_vccnz .LBB28_9 +; GFX7-NEXT: ; %bb.10: ; %ComputeEnd ; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v2, exec_lo, 0 ; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v2, exec_hi, v2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX7-NEXT: s_and_b64 s[0:1], vcc, -1 ; GFX7-NEXT: ; implicit-def: $vgpr2 -; GFX7-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7-NEXT: s_xor_b64 s[6:7], exec, s[0:1] -; GFX7-NEXT: s_cbranch_execz .LBB28_13 -; GFX7-NEXT: ; %bb.10: +; GFX7-NEXT: s_cmov_b64 exec, vcc +; GFX7-NEXT: s_cbranch_scc0 .LBB28_14 +; GFX7-NEXT: ; %bb.11: ; GFX7-NEXT: v_mov_b32_e32 v3, s4 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v2, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB28_11: ; %atomicrmw.start8 +; GFX7-NEXT: .LBB28_12: ; %atomicrmw.start8 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v2 @@ -7719,12 +7763,12 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 ; GFX7-NEXT: s_or_b64 s[4:5], s[0:1], s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB28_11 -; GFX7-NEXT: ; %bb.12: ; %Flow -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: .LBB28_13: ; %Flow20 +; GFX7-NEXT: s_andn2_b64 s[0:1], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[0:1], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB28_12 +; GFX7-NEXT: ; %bb.13: ; %Flow ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: .LBB28_14: ; GFX7-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX7-NEXT: v_readfirstlane_b32 s4, v2 ; GFX7-NEXT: v_add_f32_e32 v0, s4, v0 @@ -7745,10 +7789,12 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_add_i32 s5, s5, 4 +; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX6-NEXT: ; implicit-def: $vgpr1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX6-NEXT: s_cbranch_execz .LBB28_4 +; GFX6-NEXT: s_cmov_b64 exec, vcc +; GFX6-NEXT: s_cbranch_scc0 .LBB28_4 ; GFX6-NEXT: ; %bb.1: ; GFX6-NEXT: s_lshl_b32 s8, s5, 3 ; GFX6-NEXT: v_mov_b32_e32 v2, s8 @@ -7766,19 +7812,21 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v4 ; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB28_2 +; GFX6-NEXT: s_andn2_b64 s[0:1], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[0:1], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB28_2 ; GFX6-NEXT: ; %bb.3: ; %Flow21 -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: .LBB28_4: ; %Flow22 ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX6-NEXT: .LBB28_4: ; GFX6-NEXT: s_mov_b64 s[8:9], exec +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v2, s8, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s9, v2 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 +; GFX6-NEXT: s_and_b64 s[10:11], s[0:1], -1 +; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: v_readfirstlane_b32 s10, v1 -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v1, s8, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v1, s9, v1 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] -; GFX6-NEXT: s_cbranch_execz .LBB28_7 +; GFX6-NEXT: s_cmov_b64 exec, s[0:1] +; GFX6-NEXT: s_cbranch_scc0 .LBB28_8 ; GFX6-NEXT: ; %bb.5: ; GFX6-NEXT: s_lshl_b32 s0, s5, 4 ; 
GFX6-NEXT: v_mov_b32_e32 v1, s0 @@ -7795,11 +7843,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] +; GFX6-NEXT: s_andn2_b64 s[0:1], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB28_6 -; GFX6-NEXT: .LBB28_7: ; %Flow20 +; GFX6-NEXT: s_cselect_b64 exec, s[0:1], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB28_6 +; GFX6-NEXT: ; %bb.7: ; %Flow19 ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX6-NEXT: .LBB28_8: ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX6-NEXT: v_add_f32_e32 v0, s10, v0 @@ -7808,7 +7858,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX6-NEXT: ; implicit-def: $vgpr0 -; GFX6-NEXT: .LBB28_8: ; %ComputeLoop +; GFX6-NEXT: .LBB28_9: ; %ComputeLoop ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5 @@ -7820,21 +7870,22 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: v_writelane_b32 v0, s8, m0 ; GFX6-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7] -; GFX6-NEXT: s_cbranch_vccnz .LBB28_8 -; GFX6-NEXT: ; %bb.9: ; %ComputeEnd +; GFX6-NEXT: s_cbranch_vccnz .LBB28_9 +; GFX6-NEXT: ; %bb.10: ; %ComputeEnd ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v2, exec_lo, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v2, exec_hi, v2 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX6-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX6-NEXT: s_and_b64 s[0:1], vcc, -1 ; GFX6-NEXT: ; implicit-def: $vgpr2 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX6-NEXT: s_xor_b64 s[6:7], exec, s[0:1] -; GFX6-NEXT: s_cbranch_execz .LBB28_13 -; GFX6-NEXT: ; %bb.10: +; 
GFX6-NEXT: s_cmov_b64 exec, vcc +; GFX6-NEXT: s_cbranch_scc0 .LBB28_14 +; GFX6-NEXT: ; %bb.11: ; GFX6-NEXT: v_mov_b32_e32 v3, s4 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v2, v3 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB28_11: ; %atomicrmw.start8 +; GFX6-NEXT: .LBB28_12: ; %atomicrmw.start8 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v4, v2 @@ -7843,12 +7894,12 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 ; GFX6-NEXT: s_or_b64 s[4:5], s[0:1], s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB28_11 -; GFX6-NEXT: ; %bb.12: ; %Flow -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: .LBB28_13: ; %Flow18 +; GFX6-NEXT: s_andn2_b64 s[0:1], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[0:1], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB28_12 +; GFX6-NEXT: ; %bb.13: ; %Flow ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX6-NEXT: .LBB28_14: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: v_readfirstlane_b32 s4, v2 ; GFX6-NEXT: v_add_f32_e32 v0, s4, v0 @@ -7876,14 +7927,16 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x8 ; GFX12-NEXT: s_mov_b32 s6, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX12-NEXT: ; implicit-def: $vgpr1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_co_i32 s1, s5, 4 -; GFX12-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX12-NEXT: s_cbranch_execz .LBB29_2 +; GFX12-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX12-NEXT: s_cmov_b32 exec_lo, vcc_lo +; 
GFX12-NEXT: s_cbranch_scc0 .LBB29_2 ; GFX12-NEXT: ; %bb.1: ; GFX12-NEXT: s_bcnt1_i32_b32 s5, s6 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -7892,16 +7945,19 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mul_f32 v1, 0x42280000, v1 ; GFX12-NEXT: ds_add_rtn_f32 v1, v2, v1 -; GFX12-NEXT: .LBB29_2: ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: .LBB29_2: ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_mov_b32 s7, exec_lo ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0 ; GFX12-NEXT: s_mov_b32 s6, exec_lo -; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX12-NEXT: s_cbranch_execz .LBB29_4 +; GFX12-NEXT: v_cmp_eq_u32_e64 s0, 0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_cmp_lg_u32 s0, 0 +; GFX12-NEXT: s_cmov_b32 exec_lo, s0 +; GFX12-NEXT: s_cbranch_scc0 .LBB29_4 ; GFX12-NEXT: ; %bb.3: ; GFX12-NEXT: s_bcnt1_i32_b32 s0, s7 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -7910,8 +7966,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mul_f32 v1, 0x42280000, v1 ; GFX12-NEXT: ds_add_f32 v2, v1 -; GFX12-NEXT: .LBB29_4: ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: .LBB29_4: ; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX12-NEXT: s_mov_b32 s1, exec_lo ; GFX12-NEXT: s_brev_b32 s0, 1 @@ -7935,17 +7991,18 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX12-NEXT: ; %bb.6: ; %ComputeEnd ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX12-NEXT: ; implicit-def: $vgpr1 -; GFX12-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX12-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execz .LBB29_8 +; GFX12-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX12-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX12-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12-NEXT: s_cbranch_scc0 .LBB29_8 ; GFX12-NEXT: ; %bb.7: ; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: ds_add_rtn_f32 v1, v1, v2 -; GFX12-NEXT: .LBB29_8: ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: .LBB29_8: ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v1 @@ -7961,32 +8018,36 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX940-LABEL: local_ds_fadd_one_as: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX940-NEXT: s_mov_b64 s[6:7], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_add_i32 s5, s5, 4 +; GFX940-NEXT: s_mov_b64 s[0:1], exec +; GFX940-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX940-NEXT: ; implicit-def: $vgpr1 -; GFX940-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX940-NEXT: s_cbranch_execz .LBB29_2 +; GFX940-NEXT: s_cmov_b64 exec, vcc +; GFX940-NEXT: s_cbranch_scc0 .LBB29_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX940-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX940-NEXT: s_lshl_b32 s8, s5, 3 -; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 ; GFX940-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: ds_add_rtn_f32 v1, v2, v1 +; GFX940-NEXT: s_or_b64 
exec, exec, s[0:1] ; GFX940-NEXT: .LBB29_2: -; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_mov_b64 s[8:9], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v2 +; GFX940-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 +; GFX940-NEXT: s_mov_b64 s[6:7], exec ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_readfirstlane_b32 s10, v1 -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v1, s8, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v1, s9, v1 -; GFX940-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GFX940-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] -; GFX940-NEXT: s_cbranch_execz .LBB29_4 +; GFX940-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX940-NEXT: s_cmov_b64 exec, s[0:1] +; GFX940-NEXT: s_cbranch_scc0 .LBB29_4 ; GFX940-NEXT: ; %bb.3: ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[8:9] ; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 @@ -7994,8 +8055,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX940-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX940-NEXT: v_mov_b32_e32 v2, s0 ; GFX940-NEXT: ds_add_f32 v2, v1 -; GFX940-NEXT: .LBB29_4: ; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX940-NEXT: .LBB29_4: ; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX940-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX940-NEXT: v_add_f32_e32 v0, s10, v0 @@ -8020,15 +8081,16 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX940-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX940-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX940-NEXT: ; implicit-def: $vgpr2 -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX940-NEXT: s_cbranch_execz .LBB29_8 +; GFX940-NEXT: s_cmov_b64 exec, vcc +; GFX940-NEXT: s_cbranch_scc0 .LBB29_8 ; GFX940-NEXT: ; %bb.7: ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: ds_add_rtn_f32 v2, v2, v1 -; GFX940-NEXT: .LBB29_8: ; 
GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: .LBB29_8: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_readfirstlane_b32 s2, v2 @@ -8044,14 +8106,16 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x8 ; GFX11-NEXT: s_mov_b32 s6, exec_lo -; GFX11-NEXT: ; implicit-def: $vgpr1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_mov_b32 s0, exec_lo ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_i32 s1, s5, 4 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX11-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB29_2 ; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: s_bcnt1_i32_b32 s5, s6 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -8060,16 +8124,19 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mul_f32 v1, 0x42280000, v1 ; GFX11-NEXT: ds_add_rtn_f32 v1, v2, v1 -; GFX11-NEXT: .LBB29_2: ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: .LBB29_2: ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: s_mov_b32 s7, exec_lo ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0 ; GFX11-NEXT: s_mov_b32 s6, exec_lo -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX11-NEXT: s_cbranch_execz .LBB29_4 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: 
s_cmov_b32 exec_lo, s0 +; GFX11-NEXT: s_cbranch_scc0 .LBB29_4 ; GFX11-NEXT: ; %bb.3: ; GFX11-NEXT: s_bcnt1_i32_b32 s0, s7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -8078,8 +8145,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mul_f32 v1, 0x42280000, v1 ; GFX11-NEXT: ds_add_f32 v2, v1 -; GFX11-NEXT: .LBB29_4: ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: .LBB29_4: ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX11-NEXT: s_mov_b32 s0, exec_lo @@ -8105,17 +8172,18 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX11-NEXT: ; %bb.6: ; %ComputeEnd ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX11-NEXT: ; implicit-def: $vgpr2 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB29_8 +; GFX11-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX11-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX11-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB29_8 ; GFX11-NEXT: ; %bb.7: ; GFX11-NEXT: v_mov_b32_e32 v2, s4 ; GFX11-NEXT: ds_add_rtn_f32 v2, v2, v1 -; GFX11-NEXT: .LBB29_8: ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: .LBB29_8: ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s2, v2 @@ -8131,13 +8199,15 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 ; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: ; implicit-def: $vgpr1 +; GFX10-NEXT: s_mov_b32 s0, exec_lo ; 
GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX10-NEXT: ; implicit-def: $vgpr1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_i32 s1, s5, 4 -; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB29_2 +; GFX10-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB29_2 ; GFX10-NEXT: ; %bb.1: ; GFX10-NEXT: s_bcnt1_i32_b32 s5, s6 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s5 @@ -8145,16 +8215,18 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX10-NEXT: ds_add_rtn_f32 v1, v2, v1 -; GFX10-NEXT: .LBB29_2: ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-NEXT: .LBB29_2: ; GFX10-NEXT: s_mov_b32 s7, exec_lo ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0 +; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2 -; GFX10-NEXT: s_and_saveexec_b32 s6, s0 -; GFX10-NEXT: s_cbranch_execz .LBB29_4 +; GFX10-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10-NEXT: s_cmov_b32 exec_lo, s0 +; GFX10-NEXT: s_cbranch_scc0 .LBB29_4 ; GFX10-NEXT: ; %bb.3: ; GFX10-NEXT: s_bcnt1_i32_b32 s0, s7 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 @@ -8162,9 +8234,9 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX10-NEXT: ds_add_f32 v2, v1 -; GFX10-NEXT: .LBB29_4: ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: .LBB29_4: ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX10-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX10-NEXT: s_mov_b32 s0, exec_lo @@ -8187,15 +8259,16 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 
0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX10-NEXT: ; implicit-def: $vgpr2 -; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX10-NEXT: s_cbranch_execz .LBB29_8 +; GFX10-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX10-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB29_8 ; GFX10-NEXT: ; %bb.7: ; GFX10-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-NEXT: ds_add_rtn_f32 v2, v2, v1 -; GFX10-NEXT: .LBB29_8: ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-NEXT: .LBB29_8: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 @@ -8209,32 +8282,36 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX90A-LABEL: local_ds_fadd_one_as: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX90A-NEXT: s_mov_b64 s[6:7], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_add_i32 s5, s5, 4 +; GFX90A-NEXT: s_mov_b64 s[0:1], exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX90A-NEXT: ; implicit-def: $vgpr1 -; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB29_2 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB29_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX90A-NEXT: s_lshl_b32 s8, s5, 3 -; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 ; GFX90A-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: ds_add_rtn_f32 v1, v2, v1 +; GFX90A-NEXT: s_or_b64 exec, exec, s[0:1] ; 
GFX90A-NEXT: .LBB29_2: -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[8:9], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 +; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_readfirstlane_b32 s10, v1 -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v1, s8, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v1, s9, v1 -; GFX90A-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] -; GFX90A-NEXT: s_cbranch_execz .LBB29_4 +; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_cmov_b64 exec, s[0:1] +; GFX90A-NEXT: s_cbranch_scc0 .LBB29_4 ; GFX90A-NEXT: ; %bb.3: ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[8:9] ; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 @@ -8242,8 +8319,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, s0 ; GFX90A-NEXT: ds_add_f32 v2, v1 -; GFX90A-NEXT: .LBB29_4: ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB29_4: ; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX90A-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX90A-NEXT: v_add_f32_e32 v0, s10, v0 @@ -8268,15 +8345,16 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX90A-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX90A-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX90A-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX90A-NEXT: s_cbranch_execz .LBB29_8 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB29_8 ; GFX90A-NEXT: ; %bb.7: ; GFX90A-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NEXT: ds_add_rtn_f32 v2, v2, v1 -; GFX90A-NEXT: .LBB29_8: ; GFX90A-NEXT: s_or_b64 exec, 
exec, s[0:1] +; GFX90A-NEXT: .LBB29_8: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_readfirstlane_b32 s2, v2 @@ -8290,32 +8368,36 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX908-LABEL: local_ds_fadd_one_as: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 -; GFX908-NEXT: s_mov_b64 s[0:1], exec -; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX908-NEXT: s_mov_b64 s[6:7], exec +; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_add_i32 s5, s5, 4 +; GFX908-NEXT: s_mov_b64 s[0:1], exec +; GFX908-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX908-NEXT: ; implicit-def: $vgpr1 -; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX908-NEXT: s_cbranch_execz .LBB29_2 +; GFX908-NEXT: s_cmov_b64 exec, vcc +; GFX908-NEXT: s_cbranch_scc0 .LBB29_2 ; GFX908-NEXT: ; %bb.1: -; GFX908-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX908-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX908-NEXT: s_lshl_b32 s8, s5, 3 -; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 ; GFX908-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX908-NEXT: v_mov_b32_e32 v2, s8 ; GFX908-NEXT: ds_add_rtn_f32 v1, v2, v1 +; GFX908-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX908-NEXT: .LBB29_2: -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[8:9], exec +; GFX908-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0 +; GFX908-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v2 +; GFX908-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 +; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_readfirstlane_b32 s10, v1 -; GFX908-NEXT: v_mbcnt_lo_u32_b32 v1, s8, 0 -; GFX908-NEXT: v_mbcnt_hi_u32_b32 v1, s9, v1 -; GFX908-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GFX908-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] -; 
GFX908-NEXT: s_cbranch_execz .LBB29_4 +; GFX908-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX908-NEXT: s_cmov_b64 exec, s[0:1] +; GFX908-NEXT: s_cbranch_scc0 .LBB29_4 ; GFX908-NEXT: ; %bb.3: ; GFX908-NEXT: s_bcnt1_i32_b64 s0, s[8:9] ; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 @@ -8323,8 +8405,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX908-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX908-NEXT: v_mov_b32_e32 v2, s0 ; GFX908-NEXT: ds_add_f32 v2, v1 -; GFX908-NEXT: .LBB29_4: ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: .LBB29_4: ; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX908-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX908-NEXT: v_add_f32_e32 v0, s10, v0 @@ -8349,15 +8431,16 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX908-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX908-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX908-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX908-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX908-NEXT: ; implicit-def: $vgpr2 -; GFX908-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX908-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX908-NEXT: s_cbranch_execz .LBB29_8 +; GFX908-NEXT: s_cmov_b64 exec, vcc +; GFX908-NEXT: s_cbranch_scc0 .LBB29_8 ; GFX908-NEXT: ; %bb.7: ; GFX908-NEXT: v_mov_b32_e32 v2, s4 ; GFX908-NEXT: ds_add_rtn_f32 v2, v2, v1 -; GFX908-NEXT: .LBB29_8: ; GFX908-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX908-NEXT: .LBB29_8: ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_readfirstlane_b32 s2, v2 @@ -8371,33 +8454,37 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX8-LABEL: local_ds_fadd_one_as: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: 
v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_i32 s5, s5, 4 +; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX8-NEXT: s_cbranch_execz .LBB29_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB29_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX8-NEXT: s_lshl_b32 s8, s5, 3 -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: ds_add_rtn_f32 v1, v2, v1 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8-NEXT: .LBB29_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[8:9], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v2 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 +; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s10, v1 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, s8, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v1, s9, v1 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GFX8-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] -; GFX8-NEXT: s_cbranch_execz .LBB29_4 +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8-NEXT: s_cmov_b64 exec, s[0:1] +; GFX8-NEXT: s_cbranch_scc0 .LBB29_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[8:9] ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 @@ -8405,8 +8492,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: ds_add_f32 v2, v1 -; GFX8-NEXT: .LBB29_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: .LBB29_4: ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX8-NEXT: 
v_mul_f32_e32 v0, 0x42280000, v0 ; GFX8-NEXT: v_add_f32_e32 v0, s10, v0 @@ -8431,16 +8518,17 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX8-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX8-NEXT: ; implicit-def: $vgpr2 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8-NEXT: s_cbranch_execz .LBB29_8 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB29_8 ; GFX8-NEXT: ; %bb.7: ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_add_rtn_f32 v2, v2, v1 -; GFX8-NEXT: .LBB29_8: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: .LBB29_8: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v2 @@ -8461,10 +8549,12 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_i32 s5, s5, 4 +; GFX7-NEXT: s_mov_b64 s[6:7], exec +; GFX7-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX7-NEXT: ; implicit-def: $vgpr1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX7-NEXT: s_cbranch_execz .LBB29_4 +; GFX7-NEXT: s_cmov_b64 exec, vcc +; GFX7-NEXT: s_cbranch_scc0 .LBB29_4 ; GFX7-NEXT: ; %bb.1: ; GFX7-NEXT: s_lshl_b32 s8, s5, 3 ; GFX7-NEXT: v_mov_b32_e32 v2, s8 @@ -8482,19 +8572,21 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v4 ; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB29_2 +; GFX7-NEXT: s_andn2_b64 s[0:1], exec, s[8:9] +; GFX7-NEXT: s_cselect_b64 exec, s[0:1], s[8:9] +; GFX7-NEXT: 
s_cbranch_scc1 .LBB29_2 ; GFX7-NEXT: ; %bb.3: ; %Flow23 -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: .LBB29_4: ; %Flow24 ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: .LBB29_4: ; GFX7-NEXT: s_mov_b64 s[8:9], exec +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v2, s8, 0 +; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s9, v2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 +; GFX7-NEXT: s_and_b64 s[10:11], s[0:1], -1 +; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: v_readfirstlane_b32 s10, v1 -; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v1, s8, 0 -; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v1, s9, v1 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] -; GFX7-NEXT: s_cbranch_execz .LBB29_7 +; GFX7-NEXT: s_cmov_b64 exec, s[0:1] +; GFX7-NEXT: s_cbranch_scc0 .LBB29_8 ; GFX7-NEXT: ; %bb.5: ; GFX7-NEXT: s_lshl_b32 s0, s5, 4 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 @@ -8511,11 +8603,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] +; GFX7-NEXT: s_andn2_b64 s[0:1], exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB29_6 -; GFX7-NEXT: .LBB29_7: ; %Flow22 +; GFX7-NEXT: s_cselect_b64 exec, s[0:1], s[8:9] +; GFX7-NEXT: s_cbranch_scc1 .LBB29_6 +; GFX7-NEXT: ; %bb.7: ; %Flow21 ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: .LBB29_8: ; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX7-NEXT: v_add_f32_e32 v0, s10, v0 @@ -8524,7 +8618,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX7-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX7-NEXT: ; implicit-def: $vgpr0 -; GFX7-NEXT: .LBB29_8: ; %ComputeLoop +; GFX7-NEXT: .LBB29_9: ; %ComputeLoop ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: 
s_ff1_i32_b64 s5, s[0:1] ; GFX7-NEXT: s_lshl_b64 s[6:7], 1, s5 @@ -8536,21 +8630,22 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: v_writelane_b32 v0, s8, m0 ; GFX7-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX7-NEXT: s_and_b64 vcc, exec, s[6:7] -; GFX7-NEXT: s_cbranch_vccnz .LBB29_8 -; GFX7-NEXT: ; %bb.9: ; %ComputeEnd +; GFX7-NEXT: s_cbranch_vccnz .LBB29_9 +; GFX7-NEXT: ; %bb.10: ; %ComputeEnd ; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v2, exec_lo, 0 ; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v2, exec_hi, v2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX7-NEXT: s_and_b64 s[0:1], vcc, -1 ; GFX7-NEXT: ; implicit-def: $vgpr2 -; GFX7-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7-NEXT: s_xor_b64 s[6:7], exec, s[0:1] -; GFX7-NEXT: s_cbranch_execz .LBB29_13 -; GFX7-NEXT: ; %bb.10: +; GFX7-NEXT: s_cmov_b64 exec, vcc +; GFX7-NEXT: s_cbranch_scc0 .LBB29_14 +; GFX7-NEXT: ; %bb.11: ; GFX7-NEXT: v_mov_b32_e32 v3, s4 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v2, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB29_11: ; %atomicrmw.start8 +; GFX7-NEXT: .LBB29_12: ; %atomicrmw.start8 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v2 @@ -8559,12 +8654,12 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 ; GFX7-NEXT: s_or_b64 s[4:5], s[0:1], s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB29_11 -; GFX7-NEXT: ; %bb.12: ; %Flow -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: .LBB29_13: ; %Flow20 +; GFX7-NEXT: s_andn2_b64 s[0:1], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[0:1], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB29_12 +; GFX7-NEXT: ; %bb.13: ; %Flow ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: .LBB29_14: ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; 
GFX7-NEXT: v_readfirstlane_b32 s4, v2 ; GFX7-NEXT: v_add_f32_e32 v0, s4, v0 @@ -8585,10 +8680,12 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_add_i32 s5, s5, 4 +; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX6-NEXT: ; implicit-def: $vgpr1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX6-NEXT: s_cbranch_execz .LBB29_4 +; GFX6-NEXT: s_cmov_b64 exec, vcc +; GFX6-NEXT: s_cbranch_scc0 .LBB29_4 ; GFX6-NEXT: ; %bb.1: ; GFX6-NEXT: s_lshl_b32 s8, s5, 3 ; GFX6-NEXT: v_mov_b32_e32 v2, s8 @@ -8606,19 +8703,21 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v4 ; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB29_2 +; GFX6-NEXT: s_andn2_b64 s[0:1], exec, s[8:9] +; GFX6-NEXT: s_cselect_b64 exec, s[0:1], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB29_2 ; GFX6-NEXT: ; %bb.3: ; %Flow21 -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: .LBB29_4: ; %Flow22 ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX6-NEXT: .LBB29_4: ; GFX6-NEXT: s_mov_b64 s[8:9], exec +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v2, s8, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s9, v2 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 +; GFX6-NEXT: s_and_b64 s[10:11], s[0:1], -1 +; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: v_readfirstlane_b32 s10, v1 -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v1, s8, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v1, s9, v1 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] -; GFX6-NEXT: s_cbranch_execz .LBB29_7 +; GFX6-NEXT: s_cmov_b64 exec, s[0:1] +; GFX6-NEXT: s_cbranch_scc0 .LBB29_8 ; GFX6-NEXT: ; %bb.5: ; GFX6-NEXT: s_lshl_b32 s0, s5, 4 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 @@ 
-8635,11 +8734,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] +; GFX6-NEXT: s_andn2_b64 s[0:1], exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB29_6 -; GFX6-NEXT: .LBB29_7: ; %Flow20 +; GFX6-NEXT: s_cselect_b64 exec, s[0:1], s[8:9] +; GFX6-NEXT: s_cbranch_scc1 .LBB29_6 +; GFX6-NEXT: ; %bb.7: ; %Flow19 ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX6-NEXT: .LBB29_8: ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX6-NEXT: v_add_f32_e32 v0, s10, v0 @@ -8648,7 +8749,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX6-NEXT: ; implicit-def: $vgpr0 -; GFX6-NEXT: .LBB29_8: ; %ComputeLoop +; GFX6-NEXT: .LBB29_9: ; %ComputeLoop ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5 @@ -8660,21 +8761,22 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: v_writelane_b32 v0, s8, m0 ; GFX6-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7] -; GFX6-NEXT: s_cbranch_vccnz .LBB29_8 -; GFX6-NEXT: ; %bb.9: ; %ComputeEnd +; GFX6-NEXT: s_cbranch_vccnz .LBB29_9 +; GFX6-NEXT: ; %bb.10: ; %ComputeEnd ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v2, exec_lo, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v2, exec_hi, v2 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX6-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX6-NEXT: s_and_b64 s[0:1], vcc, -1 ; GFX6-NEXT: ; implicit-def: $vgpr2 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX6-NEXT: s_xor_b64 s[6:7], exec, s[0:1] -; GFX6-NEXT: s_cbranch_execz .LBB29_13 -; GFX6-NEXT: ; %bb.10: +; GFX6-NEXT: s_cmov_b64 exec, vcc +; 
GFX6-NEXT: s_cbranch_scc0 .LBB29_14 +; GFX6-NEXT: ; %bb.11: ; GFX6-NEXT: v_mov_b32_e32 v3, s4 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v2, v3 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB29_11: ; %atomicrmw.start8 +; GFX6-NEXT: .LBB29_12: ; %atomicrmw.start8 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v4, v2 @@ -8683,12 +8785,12 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 ; GFX6-NEXT: s_or_b64 s[4:5], s[0:1], s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB29_11 -; GFX6-NEXT: ; %bb.12: ; %Flow -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: .LBB29_13: ; %Flow18 +; GFX6-NEXT: s_andn2_b64 s[0:1], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[0:1], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB29_12 +; GFX6-NEXT: ; %bb.13: ; %Flow ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX6-NEXT: .LBB29_14: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: v_readfirstlane_b32 s4, v2 ; GFX6-NEXT: v_add_f32_e32 v0, s4, v0 @@ -8795,10 +8897,10 @@ define float @local_atomic_fadd_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB30_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -8817,10 +8919,10 @@ define float @local_atomic_fadd_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; 
GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB30_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(3) %ptr, float 4.0 seq_cst, !amdgpu.ignore.denormal.mode !0 @@ -8910,11 +9012,11 @@ define void @local_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB31_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode: @@ -8931,11 +9033,11 @@ define void @local_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB31_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(3) %ptr, float 4.0 seq_cst, !amdgpu.ignore.denormal.mode !0 ret void diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll index 6dec36c316ee31..294e03e6799296 100644 --- 
a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll @@ -817,11 +817,11 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB8_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -849,10 +849,10 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB8_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -886,11 +886,11 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; 
GFX11-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -900,30 +900,30 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: ds_read_b32 v2, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX10-NEXT: ds_read_b32 v3, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX10-NEXT: v_not_b32_e32 v3, v3 +; GFX10-NEXT: v_not_b32_e32 v2, v2 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v0, v4 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v2, 4.0, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v2, v4, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX10-NEXT: v_max_f16_e32 v3, 4.0, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB8_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX10-NEXT: ; 
%bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v0, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmax_ret_f16: @@ -950,10 +950,10 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -981,10 +981,10 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB8_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -1014,10 +1014,10 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB8_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: 
v_lshrrev_b32_e32 v0, v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1047,10 +1047,10 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -1081,10 +1081,10 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB8_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -1130,44 +1130,44 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB9_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; 
GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_f16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v0, 0xfffe, v0 -; GFX940-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX940-NEXT: ds_read_b32 v3, v1 -; GFX940-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX940-NEXT: v_add_u32_e32 v1, 0xfffe, v0 +; GFX940-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX940-NEXT: ds_read_b32 v3, v0 +; GFX940-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v2, v0, s0 +; GFX940-NEXT: v_lshlrev_b32_e64 v2, v1, s0 ; GFX940-NEXT: v_not_b32_e32 v2, v2 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v4, v3 -; GFX940-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v3, v1, v4 ; GFX940-NEXT: v_max_f16_e32 v3, v3, v3 ; GFX940-NEXT: v_max_f16_e32 v3, 4.0, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, v1, v3 ; GFX940-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB9_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmax_ret_f16__offset: @@ 
-1202,11 +1202,11 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -1236,109 +1236,109 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB9_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmax_ret_f16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v0, 0xfffe, v0 -; GFX90A-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX90A-NEXT: ds_read_b32 v3, v1 -; GFX90A-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX90A-NEXT: v_add_u32_e32 v1, 0xfffe, v0 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX90A-NEXT: ds_read_b32 v3, v0 +; GFX90A-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v2, v0, s4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v2, v1, s4 
; GFX90A-NEXT: v_not_b32_e32 v2, v2 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v1, v4 ; GFX90A-NEXT: v_max_f16_e32 v3, v3, v3 ; GFX90A-NEXT: v_max_f16_e32 v3, 4.0, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v1, v3 ; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmax_ret_f16__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v0, 0xfffe, v0 -; GFX908-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX908-NEXT: ds_read_b32 v3, v1 -; GFX908-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX908-NEXT: v_add_u32_e32 v1, 0xfffe, v0 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX908-NEXT: ds_read_b32 v3, v0 +; GFX908-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v2, v0, s4 +; GFX908-NEXT: v_lshlrev_b32_e64 v2, v1, s4 ; GFX908-NEXT: v_not_b32_e32 v2, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v1, v4 ; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 ; GFX908-NEXT: v_max_f16_e32 v3, 4.0, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v1, v3 ; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB9_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmax_ret_f16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffe, v0 -; GFX8-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xfffe, v0 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v1 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v3, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX8-NEXT: ds_read_b32 v3, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4 +; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s4 ; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v0, v4 
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, v1, v4 ; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 ; GFX8-NEXT: v_max_f16_e32 v3, 4.0, v3 ; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v1, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB9_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmax_ret_f16__offset: @@ -1368,10 +1368,10 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB9_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -1403,10 +1403,10 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB9_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] 
+; GFX6-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -1451,11 +1451,11 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB10_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_f16: @@ -1481,11 +1481,11 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB10_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmax_noret_f16: @@ -1518,11 +1518,11 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX11-NEXT: v_mov_b32_e32 v2, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB10_1 +; 
GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmax_noret_f16: @@ -1550,10 +1550,10 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB10_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmax_noret_f16: @@ -1579,11 +1579,11 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmax_noret_f16: @@ -1609,11 +1609,11 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; 
GFX908-NEXT: s_cbranch_execnz .LBB10_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmax_noret_f16: @@ -1641,11 +1641,11 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB10_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmax_noret_f16: @@ -1673,11 +1673,11 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB10_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmax_noret_f16: @@ -1705,11 +1705,11 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB10_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; 
GFX6-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr addrspace(3) %ptr, half 4.0 seq_cst ret void @@ -1753,11 +1753,11 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB11_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_f16__offset: @@ -1784,11 +1784,11 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB11_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmax_noret_f16__offset: @@ -1823,11 +1823,11 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: 
s_cbranch_execnz .LBB11_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmax_noret_f16__offset: @@ -1856,10 +1856,10 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB11_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmax_noret_f16__offset: @@ -1886,11 +1886,11 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmax_noret_f16__offset: @@ -1917,11 +1917,11 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: 
v_mov_b32_e32 v3, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB11_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmax_noret_f16__offset: @@ -1950,11 +1950,11 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB11_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmax_noret_f16__offset: @@ -1983,11 +1983,11 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB11_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmax_noret_f16__offset: @@ -2016,11 +2016,11 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, 
exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB11_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(3) %ptr, i32 32767 %unused = atomicrmw fmax ptr addrspace(3) %gep, half 4.0 seq_cst @@ -2054,11 +2054,11 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB12_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2079,10 +2079,10 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB12_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -2107,11 +2107,11 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB12_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2134,10 +2134,10 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB12_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2158,10 +2158,10 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2182,10 +2182,10 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: 
s_cbranch_execnz .LBB12_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v1 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -2207,10 +2207,10 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB12_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -2233,10 +2233,10 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB12_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -2260,10 +2260,10 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB12_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(3) %ptr, i32 32767 @@ -2298,11 +2298,11 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB13_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_f16__offset__align4: @@ -2321,11 +2321,11 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB13_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmax_noret_f16__offset__align4: @@ -2349,11 +2349,11 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB13_1 +; GFX11-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmax_noret_f16__offset__align4: @@ -2375,10 +2375,10 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB13_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmax_noret_f16__offset__align4: @@ -2397,11 +2397,11 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmax_noret_f16__offset__align4: @@ -2420,11 +2420,11 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v2 
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB13_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmax_noret_f16__offset__align4: @@ -2444,11 +2444,11 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB13_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmax_noret_f16__offset__align4: @@ -2469,11 +2469,11 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB13_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmax_noret_f16__offset__align4: @@ -2495,11 +2495,11 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, 
exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB13_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(3) %ptr, i32 32767 %unused = atomicrmw fmax ptr addrspace(3) %gep, half 4.0 seq_cst, align 4 @@ -2553,11 +2553,11 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB14_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2592,10 +2592,10 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB14_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -2637,11 +2637,11 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2674,10 +2674,10 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB14_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2710,10 +2710,10 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2746,10 +2746,10 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: 
s_cbranch_execnz .LBB14_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -2784,10 +2784,10 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB14_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -2818,10 +2818,10 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB14_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -2853,10 +2853,10 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB14_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 
.LBB14_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -2909,24 +2909,24 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB15_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_bf16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v0, 0xfffe, v0 -; GFX940-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX940-NEXT: ds_read_b32 v3, v1 -; GFX940-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX940-NEXT: v_add_u32_e32 v1, 0xfffe, v0 +; GFX940-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX940-NEXT: ds_read_b32 v3, v0 +; GFX940-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v2, v0, s0 +; GFX940-NEXT: v_lshlrev_b32_e64 v2, v1, s0 ; GFX940-NEXT: v_not_b32_e32 v2, v2 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff @@ -2934,7 +2934,7 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v4, v3 -; GFX940-NEXT: v_lshrrev_b32_sdwa v3, 
v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_max_f32_e32 v3, 4.0, v3 ; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1 @@ -2943,17 +2943,17 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX940-NEXT: s_nop 1 ; GFX940-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX940-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB15_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmax_ret_bf16__offset: @@ -2996,11 +2996,11 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; 
GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3034,23 +3034,23 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB15_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmax_ret_bf16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v0, 0xfffe, v0 -; GFX90A-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX90A-NEXT: ds_read_b32 v3, v1 -; GFX90A-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX90A-NEXT: v_add_u32_e32 v1, 0xfffe, v0 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX90A-NEXT: ds_read_b32 v3, v0 +; GFX90A-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v2, v0, s4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v2, v1, s4 ; GFX90A-NEXT: v_not_b32_e32 v2, v2 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff @@ -3058,36 +3058,36 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v1, 
v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX90A-NEXT: v_max_f32_e32 v3, 4.0, v3 ; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmax_ret_bf16__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v0, 0xfffe, v0 -; GFX908-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX908-NEXT: ds_read_b32 v3, v1 -; GFX908-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX908-NEXT: v_add_u32_e32 v1, 0xfffe, v0 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX908-NEXT: ds_read_b32 v3, v0 +; GFX908-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v2, v0, s4 +; GFX908-NEXT: v_lshlrev_b32_e64 v2, v1, s4 ; GFX908-NEXT: v_not_b32_e32 v2, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; 
GFX908-NEXT: s_movk_i32 s6, 0x7fff @@ -3095,44 +3095,44 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX908-NEXT: v_max_f32_e32 v3, 4.0, v3 ; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB15_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmax_ret_bf16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffe, v0 -; GFX8-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xfffe, v0 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v1 ; GFX8-NEXT: s_mov_b32 m0, 
-1 -; GFX8-NEXT: ds_read_b32 v3, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX8-NEXT: ds_read_b32 v3, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4 +; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s4 ; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_max_f32_e32 v3, 4.0, v3 ; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 @@ -3141,17 +3141,17 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc ; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB15_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v1, 
v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmax_ret_bf16__offset: @@ -3182,10 +3182,10 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB15_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -3218,10 +3218,10 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB15_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -3273,11 +3273,11 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_bf16: @@ -3310,11 +3310,11 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB16_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmax_noret_bf16: @@ -3355,11 +3355,11 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX11-NEXT: v_mov_b32_e32 v2, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB16_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmax_noret_bf16: @@ -3391,10 +3391,10 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB16_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX10-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmax_noret_bf16: @@ -3425,11 +3425,11 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmax_noret_bf16: @@ -3460,11 +3460,11 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB16_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmax_noret_bf16: @@ -3497,11 +3497,11 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB16_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, 
exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmax_noret_bf16: @@ -3530,11 +3530,11 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB16_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmax_noret_bf16: @@ -3563,11 +3563,11 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB16_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr addrspace(3) %ptr, bfloat 4.0 seq_cst ret void @@ -3617,11 +3617,11 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_bf16__offset: @@ -3655,11 +3655,11 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB17_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmax_noret_bf16__offset: @@ -3701,11 +3701,11 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmax_noret_bf16__offset: @@ -3738,10 +3738,10 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB17_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: 
s_cbranch_scc1 .LBB17_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmax_noret_bf16__offset: @@ -3773,11 +3773,11 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmax_noret_bf16__offset: @@ -3809,11 +3809,11 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB17_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmax_noret_bf16__offset: @@ -3847,11 +3847,11 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB17_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: 
s_cbranch_scc1 .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmax_noret_bf16__offset: @@ -3881,11 +3881,11 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmax_noret_bf16__offset: @@ -3915,11 +3915,11 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB17_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(3) %ptr, i32 32767 %unused = atomicrmw fmax ptr addrspace(3) %gep, bfloat 4.0 seq_cst @@ -3960,11 +3960,11 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -3993,10 +3993,10 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB18_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -4029,11 +4029,11 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4061,10 +4061,10 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB18_1 +; GFX10-NEXT: s_andn2_b32 s5, 
exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4092,10 +4092,10 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4123,10 +4123,10 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v1 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -4154,10 +4154,10 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB18_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, 
exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -4181,10 +4181,10 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB18_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -4209,10 +4209,10 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB18_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(3) %ptr, i32 32767 @@ -4253,11 +4253,11 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 
exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_bf16__offset__align4: @@ -4284,11 +4284,11 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB19_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmax_noret_bf16__offset__align4: @@ -4319,11 +4319,11 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmax_noret_bf16__offset__align4: @@ -4350,10 +4350,10 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB19_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 
.LBB19_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmax_noret_bf16__offset__align4: @@ -4379,11 +4379,11 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB19_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmax_noret_bf16__offset__align4: @@ -4409,11 +4409,11 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB19_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmax_noret_bf16__offset__align4: @@ -4439,11 +4439,11 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB19_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; 
GFX8-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmax_noret_bf16__offset__align4: @@ -4465,11 +4465,11 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB19_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmax_noret_bf16__offset__align4: @@ -4492,11 +4492,11 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB19_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(3) %ptr, i32 32767 %unused = atomicrmw fmax ptr addrspace(3) %gep, bfloat 4.0 seq_cst, align 4 @@ -4532,11 +4532,11 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB20_1 +; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -4557,10 +4557,10 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB20_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -4583,11 +4583,11 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB20_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4609,10 +4609,10 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: 
s_cbranch_execnz .LBB20_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4632,10 +4632,10 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4655,10 +4655,10 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB20_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -4683,10 +4683,10 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB20_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX8-NEXT: 
; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -4711,24 +4711,24 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v3 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB20_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -4753,24 +4753,24 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_max_f32_e32 v5, v5, v3 ; GFX6-NEXT: v_max_f32_e32 v6, v6, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; 
GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB20_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383 @@ -4803,11 +4803,11 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB21_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -4828,10 +4828,10 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: 
v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB21_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -4854,11 +4854,11 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB21_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4880,10 +4880,10 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB21_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4903,10 +4903,10 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 
s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4926,10 +4926,10 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB21_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -4954,10 +4954,10 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB21_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -4982,24 +4982,24 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v3 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v4 ; 
GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v1 offset:65532 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB21_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -5025,24 +5025,24 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_max_f32_e32 v5, v5, v2 ; GFX6-NEXT: v_max_f32_e32 v6, v6, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v0, v6, v0 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v3, v7, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v1, v6, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v3, v0, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; 
GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB21_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fmax ptr addrspace(3) %gep, <2 x half> %val seq_cst @@ -5074,11 +5074,11 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB22_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_v2f16: @@ -5097,11 +5097,11 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB22_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, 
exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmax_noret_v2f16: @@ -5123,11 +5123,11 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX11-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB22_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmax_noret_v2f16: @@ -5148,10 +5148,10 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB22_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmax_noret_v2f16: @@ -5169,11 +5169,11 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmax_noret_v2f16: @@ -5191,11 +5191,11 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v2, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB22_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmax_noret_v2f16: @@ -5218,11 +5218,11 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB22_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmax_noret_v2f16: @@ -5246,24 +5246,24 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v1 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, 
v7, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v4 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB22_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmax_noret_v2f16: @@ -5287,24 +5287,24 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: v_max_f32_e32 v5, v5, v1 ; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v4 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: 
s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB22_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr addrspace(3) %ptr, <2 x half> %val seq_cst ret void @@ -5335,11 +5335,11 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB23_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_v2f16__offset: @@ -5358,11 +5358,11 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB23_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmax_noret_v2f16__offset: @@ -5384,11 +5384,11 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX11-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX11-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB23_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmax_noret_v2f16__offset: @@ -5409,10 +5409,10 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB23_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmax_noret_v2f16__offset: @@ -5430,11 +5430,11 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmax_noret_v2f16__offset: @@ -5452,11 +5452,11 @@ define void 
@local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v2, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB23_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmax_noret_v2f16__offset: @@ -5479,11 +5479,11 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB23_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmax_noret_v2f16__offset: @@ -5507,24 +5507,24 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v1 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 offset:65532 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v4 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 
offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB23_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmax_noret_v2f16__offset: @@ -5549,24 +5549,24 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: v_max_f32_e32 v5, v5, v1 ; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v4 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB23_1 
+; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fmax ptr addrspace(3) %gep, <2 x half> %val seq_cst @@ -5619,11 +5619,11 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB24_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -5660,10 +5660,10 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB24_1 +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -5705,12 +5705,12 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB24_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -5746,10 +5746,10 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB24_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -5785,10 +5785,10 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5824,10 +5824,10 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: 
s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB24_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -5865,10 +5865,10 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB24_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -5901,13 +5901,13 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB24_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -5940,13 +5940,13 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 
0xffff0000, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB24_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr addrspace(3) %ptr, <2 x bfloat> %val seq_cst @@ -5995,11 +5995,11 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB25_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -6036,10 +6036,10 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB25_1 +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -6081,12 +6081,12 @@ define <2 x bfloat> 
@local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB25_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -6122,10 +6122,10 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB25_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -6161,10 +6161,10 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB25_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6200,10 +6200,10 @@ define <2 x bfloat> 
@local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB25_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -6241,10 +6241,10 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB25_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -6277,13 +6277,13 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB25_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -6317,13 +6317,13 @@ define <2 x bfloat> 
@local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v4, v0, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v0 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB25_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fmax ptr addrspace(3) %gep, <2 x bfloat> %val seq_cst @@ -6371,11 +6371,11 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB26_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_v2bf16: @@ -6410,11 +6410,11 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB26_1 +; 
GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmax_noret_v2bf16: @@ -6454,12 +6454,12 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmax_noret_v2bf16: @@ -6494,10 +6494,10 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB26_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmax_noret_v2bf16: @@ -6531,11 +6531,11 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: 
s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmax_noret_v2bf16: @@ -6569,11 +6569,11 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB26_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmax_noret_v2bf16: @@ -6609,85 +6609,85 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB26_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmax_noret_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v3, v0 +; GFX7-NEXT: ds_read_b32 v4, v0 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 ; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v3, v4 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB26_1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmax_noret_v2bf16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: ds_read_b32 v4, v0 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: 
v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_max_f32_e32 v5, v5, v2 ; GFX6-NEXT: v_max_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX6-NEXT: v_alignbit_b32 v3, v3, v4, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 ; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v3, v4 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB26_1 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr addrspace(3) %ptr, <2 x bfloat> %val seq_cst ret void @@ -6734,11 +6734,11 @@ define void 
@local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB27_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_v2bf16__ofset: @@ -6773,11 +6773,11 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB27_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmax_noret_v2bf16__ofset: @@ -6817,12 +6817,12 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX11-NEXT: ; 
%bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmax_noret_v2bf16__ofset: @@ -6857,10 +6857,10 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB27_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmax_noret_v2bf16__ofset: @@ -6894,11 +6894,11 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmax_noret_v2bf16__ofset: @@ -6932,11 +6932,11 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB27_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB27_1 ; 
GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmax_noret_v2bf16__ofset: @@ -6972,48 +6972,48 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB27_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmax_noret_v2bf16__ofset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX7-NEXT: ds_read_b32 v4, v0 offset:65532 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v1 -; 
GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 ; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v3, v4 offset:65532 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB27_1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmax_noret_v2bf16__ofset: @@ -7021,37 +7021,37 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: ds_read_b32 v4, v0 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, 
v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_max_f32_e32 v5, v5, v2 ; GFX6-NEXT: v_max_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX6-NEXT: v_alignbit_b32 v3, v3, v4, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 ; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v3, v4 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB27_1 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fmax ptr addrspace(3) %gep, <2 x bfloat> %val seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll index b3132a2fa80dd2..0c78bfd1edc7f3 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll @@ -817,11 +817,11 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz 
.LBB8_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -849,10 +849,10 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB8_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -886,11 +886,11 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -900,30 +900,30 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: ds_read_b32 v2, v1 -; GFX10-NEXT: 
v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX10-NEXT: ds_read_b32 v3, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX10-NEXT: v_not_b32_e32 v3, v3 +; GFX10-NEXT: v_not_b32_e32 v2, v2 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v0, v4 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX10-NEXT: v_min_f16_e32 v2, 4.0, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v2, v4, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX10-NEXT: v_min_f16_e32 v3, 4.0, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB8_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v0, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmin_ret_f16: @@ -950,10 +950,10 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, 
s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -981,10 +981,10 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB8_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -1014,10 +1014,10 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB8_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1047,10 +1047,10 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] 
+; GFX7-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -1081,10 +1081,10 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB8_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -1130,44 +1130,44 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB9_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_f16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v0, 0xfffe, v0 -; GFX940-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX940-NEXT: ds_read_b32 v3, v1 -; GFX940-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX940-NEXT: v_add_u32_e32 v1, 0xfffe, v0 +; 
GFX940-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX940-NEXT: ds_read_b32 v3, v0 +; GFX940-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v2, v0, s0 +; GFX940-NEXT: v_lshlrev_b32_e64 v2, v1, s0 ; GFX940-NEXT: v_not_b32_e32 v2, v2 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v4, v3 -; GFX940-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v3, v1, v4 ; GFX940-NEXT: v_max_f16_e32 v3, v3, v3 ; GFX940-NEXT: v_min_f16_e32 v3, 4.0, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, v1, v3 ; GFX940-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB9_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmin_ret_f16__offset: @@ -1202,11 +1202,11 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: 
s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -1236,109 +1236,109 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB9_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmin_ret_f16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v0, 0xfffe, v0 -; GFX90A-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX90A-NEXT: ds_read_b32 v3, v1 -; GFX90A-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX90A-NEXT: v_add_u32_e32 v1, 0xfffe, v0 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX90A-NEXT: ds_read_b32 v3, v0 +; GFX90A-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v2, v0, s4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v2, v1, s4 ; GFX90A-NEXT: v_not_b32_e32 v2, v2 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v1, v4 ; GFX90A-NEXT: v_max_f16_e32 v3, v3, v3 ; GFX90A-NEXT: v_min_f16_e32 v3, 4.0, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; 
GFX90A-NEXT: v_lshlrev_b32_e32 v3, v1, v3 ; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmin_ret_f16__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v0, 0xfffe, v0 -; GFX908-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX908-NEXT: ds_read_b32 v3, v1 -; GFX908-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX908-NEXT: v_add_u32_e32 v1, 0xfffe, v0 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX908-NEXT: ds_read_b32 v3, v0 +; GFX908-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v2, v0, s4 +; GFX908-NEXT: v_lshlrev_b32_e64 v2, v1, s4 ; GFX908-NEXT: v_not_b32_e32 v2, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v1, v4 ; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 ; GFX908-NEXT: v_min_f16_e32 v3, 4.0, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v1, v3 ; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; 
GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB9_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmin_ret_f16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffe, v0 -; GFX8-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xfffe, v0 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v1 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v3, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX8-NEXT: ds_read_b32 v3, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4 +; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s4 ; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v1, v4 ; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 ; GFX8-NEXT: v_min_f16_e32 v3, 4.0, v3 ; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v1, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; 
GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB9_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmin_ret_f16__offset: @@ -1368,10 +1368,10 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB9_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -1403,10 +1403,10 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB9_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -1451,11 +1451,11 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; 
GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB10_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_f16: @@ -1481,11 +1481,11 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB10_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmin_noret_f16: @@ -1518,11 +1518,11 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX11-NEXT: v_mov_b32_e32 v2, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB10_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmin_noret_f16: @@ -1550,10 +1550,10 @@ define void @local_atomic_fmin_noret_f16(ptr 
addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB10_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmin_noret_f16: @@ -1579,11 +1579,11 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmin_noret_f16: @@ -1609,11 +1609,11 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB10_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmin_noret_f16: @@ -1641,11 +1641,11 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: 
v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB10_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmin_noret_f16: @@ -1673,11 +1673,11 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB10_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmin_noret_f16: @@ -1705,11 +1705,11 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB10_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr addrspace(3) %ptr, half 4.0 seq_cst ret void @@ -1753,11 +1753,11 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; 
GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB11_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_f16__offset: @@ -1784,11 +1784,11 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB11_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmin_noret_f16__offset: @@ -1823,11 +1823,11 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB11_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmin_noret_f16__offset: @@ -1856,10 +1856,10 @@ define void 
@local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB11_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmin_noret_f16__offset: @@ -1886,11 +1886,11 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmin_noret_f16__offset: @@ -1917,11 +1917,11 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB11_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmin_noret_f16__offset: @@ -1950,11 +1950,11 @@ define void @local_atomic_fmin_noret_f16__offset(ptr 
addrspace(3) %ptr) nounwind ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB11_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmin_noret_f16__offset: @@ -1983,11 +1983,11 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB11_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmin_noret_f16__offset: @@ -2016,11 +2016,11 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB11_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(3) %ptr, i32 32767 %unused = atomicrmw fmin ptr addrspace(3) %gep, half 4.0 seq_cst @@ -2054,11 +2054,11 @@ define half 
@local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB12_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2079,10 +2079,10 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB12_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -2107,11 +2107,11 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB12_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; 
GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2134,10 +2134,10 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB12_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2158,10 +2158,10 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2182,10 +2182,10 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB12_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v1 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -2207,10 +2207,10 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no 
; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB12_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -2233,10 +2233,10 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB12_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -2260,10 +2260,10 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB12_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(3) %ptr, i32 32767 @@ -2298,11 +2298,11 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB13_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_f16__offset__align4: @@ -2321,11 +2321,11 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB13_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmin_noret_f16__offset__align4: @@ -2349,11 +2349,11 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB13_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmin_noret_f16__offset__align4: @@ -2375,10 +2375,10 @@ define void 
@local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB13_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmin_noret_f16__offset__align4: @@ -2397,11 +2397,11 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmin_noret_f16__offset__align4: @@ -2420,11 +2420,11 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB13_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmin_noret_f16__offset__align4: @@ -2444,11 +2444,11 @@ define void 
@local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB13_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmin_noret_f16__offset__align4: @@ -2469,11 +2469,11 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB13_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmin_noret_f16__offset__align4: @@ -2495,11 +2495,11 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB13_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(3) %ptr, i32 32767 %unused = atomicrmw fmin ptr addrspace(3) %gep, half 4.0 seq_cst, align 4 @@ 
-2553,11 +2553,11 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB14_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2592,10 +2592,10 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB14_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -2637,11 +2637,11 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: 
s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2674,10 +2674,10 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB14_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2710,10 +2710,10 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2746,10 +2746,10 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB14_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -2784,10 +2784,10 @@ define bfloat 
@local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB14_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -2818,10 +2818,10 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB14_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -2853,10 +2853,10 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB14_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -2909,24 +2909,24 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB15_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_bf16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v0, 0xfffe, v0 -; GFX940-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX940-NEXT: ds_read_b32 v3, v1 -; GFX940-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX940-NEXT: v_add_u32_e32 v1, 0xfffe, v0 +; GFX940-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX940-NEXT: ds_read_b32 v3, v0 +; GFX940-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v2, v0, s0 +; GFX940-NEXT: v_lshlrev_b32_e64 v2, v1, s0 ; GFX940-NEXT: v_not_b32_e32 v2, v2 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff @@ -2934,7 +2934,7 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v4, v3 -; GFX940-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_min_f32_e32 v3, 4.0, v3 ; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1 @@ -2943,17 +2943,17 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr 
addrspace(3) %ptr) nounwin ; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX940-NEXT: s_nop 1 ; GFX940-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX940-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB15_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmin_ret_bf16__offset: @@ -2996,11 +2996,11 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3034,23 +3034,23 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX10-NEXT: 
buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB15_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmin_ret_bf16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v0, 0xfffe, v0 -; GFX90A-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX90A-NEXT: ds_read_b32 v3, v1 -; GFX90A-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX90A-NEXT: v_add_u32_e32 v1, 0xfffe, v0 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX90A-NEXT: ds_read_b32 v3, v0 +; GFX90A-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v2, v0, s4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v2, v1, s4 ; GFX90A-NEXT: v_not_b32_e32 v2, v2 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff @@ -3058,36 +3058,36 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX90A-NEXT: v_min_f32_e32 v3, 4.0, v3 ; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa 
v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmin_ret_bf16__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v0, 0xfffe, v0 -; GFX908-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX908-NEXT: ds_read_b32 v3, v1 -; GFX908-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX908-NEXT: v_add_u32_e32 v1, 0xfffe, v0 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX908-NEXT: ds_read_b32 v3, v0 +; GFX908-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v2, v0, s4 +; GFX908-NEXT: v_lshlrev_b32_e64 v2, v1, s4 ; GFX908-NEXT: v_not_b32_e32 v2, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff @@ -3095,44 +3095,44 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:DWORD +; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX908-NEXT: v_min_f32_e32 v3, 4.0, v3 ; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB15_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmin_ret_bf16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffe, v0 -; GFX8-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xfffe, v0 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v1 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v3, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX8-NEXT: ds_read_b32 v3, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4 +; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s4 ; GFX8-NEXT: 
v_not_b32_e32 v2, v2 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_min_f32_e32 v3, 4.0, v3 ; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 @@ -3141,17 +3141,17 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc ; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB15_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmin_ret_bf16__offset: @@ -3182,10 +3182,10 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; 
GFX7-NEXT: s_cbranch_execnz .LBB15_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -3218,10 +3218,10 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB15_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -3273,11 +3273,11 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_bf16: @@ -3310,11 +3310,11 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: 
s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB16_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmin_noret_bf16: @@ -3355,11 +3355,11 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX11-NEXT: v_mov_b32_e32 v2, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB16_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmin_noret_bf16: @@ -3391,10 +3391,10 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB16_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmin_noret_bf16: @@ -3425,11 +3425,11 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; 
GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmin_noret_bf16: @@ -3460,11 +3460,11 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB16_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmin_noret_bf16: @@ -3497,11 +3497,11 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB16_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmin_noret_bf16: @@ -3530,11 +3530,11 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: 
v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB16_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmin_noret_bf16: @@ -3563,11 +3563,11 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB16_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr addrspace(3) %ptr, bfloat 4.0 seq_cst ret void @@ -3617,11 +3617,11 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_bf16__offset: @@ -3655,11 +3655,11 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, 
s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB17_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmin_noret_bf16__offset: @@ -3701,11 +3701,11 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmin_noret_bf16__offset: @@ -3738,10 +3738,10 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB17_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmin_noret_bf16__offset: @@ -3773,11 +3773,11 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 
vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmin_noret_bf16__offset: @@ -3809,11 +3809,11 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB17_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmin_noret_bf16__offset: @@ -3847,11 +3847,11 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB17_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmin_noret_bf16__offset: @@ -3881,11 +3881,11 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; 
GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmin_noret_bf16__offset: @@ -3915,11 +3915,11 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB17_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(3) %ptr, i32 32767 %unused = atomicrmw fmin ptr addrspace(3) %gep, bfloat 4.0 seq_cst @@ -3960,11 +3960,11 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -3993,10 +3993,10 @@ define bfloat 
@local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB18_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -4029,11 +4029,11 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4061,10 +4061,10 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB18_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4092,10 +4092,10 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr 
addrspace(3) %ptr) ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4123,10 +4123,10 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v1 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -4154,10 +4154,10 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB18_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -4181,10 +4181,10 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, 
exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB18_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -4209,10 +4209,10 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB18_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(3) %ptr, i32 32767 @@ -4253,11 +4253,11 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_bf16__offset__align4: @@ -4284,11 +4284,11 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, 
s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB19_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmin_noret_bf16__offset__align4: @@ -4319,11 +4319,11 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmin_noret_bf16__offset__align4: @@ -4350,10 +4350,10 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB19_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmin_noret_bf16__offset__align4: @@ -4379,11 +4379,11 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB19_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmin_noret_bf16__offset__align4: @@ -4409,11 +4409,11 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB19_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmin_noret_bf16__offset__align4: @@ -4439,11 +4439,11 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB19_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmin_noret_bf16__offset__align4: @@ -4465,11 +4465,11 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB19_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmin_noret_bf16__offset__align4: @@ -4492,11 +4492,11 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB19_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(3) %ptr, i32 32767 %unused = atomicrmw fmin ptr addrspace(3) %gep, bfloat 4.0 seq_cst, align 4 @@ -4532,11 +4532,11 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB20_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] 
; @@ -4557,10 +4557,10 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB20_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -4583,11 +4583,11 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB20_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4609,10 +4609,10 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB20_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4632,10 +4632,10 @@ define <2 x half> 
@local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4655,10 +4655,10 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB20_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -4683,10 +4683,10 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB20_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -4711,24 +4711,24 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 
v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX7-NEXT: v_min_f32_e32 v5, v5, v3 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB20_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -4753,24 +4753,24 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_min_f32_e32 v5, v5, v3 ; GFX6-NEXT: v_min_f32_e32 v6, v6, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 
v1, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB20_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383 @@ -4803,11 +4803,11 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB21_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -4828,10 +4828,10 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB21_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: 
s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -4854,11 +4854,11 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB21_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4880,10 +4880,10 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB21_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4903,10 +4903,10 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: 
v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4926,10 +4926,10 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB21_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -4954,10 +4954,10 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB21_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -4982,24 +4982,24 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX7-NEXT: v_min_f32_e32 v5, v5, v3 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v1 offset:65532 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532 
; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB21_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -5025,24 +5025,24 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_min_f32_e32 v5, v5, v2 ; GFX6-NEXT: v_min_f32_e32 v6, v6, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v0, v6, v0 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v3, v7, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v1, v6, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v3, v0, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB21_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: 
s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fmin ptr addrspace(3) %gep, <2 x half> %val seq_cst @@ -5074,11 +5074,11 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB22_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_v2f16: @@ -5097,11 +5097,11 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB22_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmin_noret_v2f16: @@ -5123,11 +5123,11 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX11-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB22_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmin_noret_v2f16: @@ -5148,10 +5148,10 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB22_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmin_noret_v2f16: @@ -5169,11 +5169,11 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmin_noret_v2f16: @@ -5191,11 +5191,11 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, 
s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v2, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB22_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmin_noret_v2f16: @@ -5218,11 +5218,11 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB22_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmin_noret_v2f16: @@ -5246,24 +5246,24 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: v_min_f32_e32 v5, v5, v1 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v4 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: 
v_lshrrev_b32_e32 v5, 16, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB22_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmin_noret_v2f16: @@ -5287,24 +5287,24 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: v_min_f32_e32 v5, v5, v1 ; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v4 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB22_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: 
s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr addrspace(3) %ptr, <2 x half> %val seq_cst ret void @@ -5335,11 +5335,11 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB23_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_v2f16__offset: @@ -5358,11 +5358,11 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB23_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmin_noret_v2f16__offset: @@ -5384,11 +5384,11 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX11-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB23_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; 
GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmin_noret_v2f16__offset: @@ -5409,10 +5409,10 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB23_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmin_noret_v2f16__offset: @@ -5430,11 +5430,11 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmin_noret_v2f16__offset: @@ -5452,11 +5452,11 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v2, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB23_1 +; 
GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmin_noret_v2f16__offset: @@ -5479,11 +5479,11 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB23_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmin_noret_v2f16__offset: @@ -5507,24 +5507,24 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: v_min_f32_e32 v5, v5, v1 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 offset:65532 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v4 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, 
v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB23_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmin_noret_v2f16__offset: @@ -5549,24 +5549,24 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: v_min_f32_e32 v5, v5, v1 ; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v4 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB23_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fmin ptr addrspace(3) %gep, <2 x 
half> %val seq_cst @@ -5619,11 +5619,11 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB24_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -5660,10 +5660,10 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB24_1 +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -5705,12 +5705,12 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB24_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX11-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -5746,10 +5746,10 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB24_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -5785,10 +5785,10 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5824,10 +5824,10 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB24_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v2 ; GFX908-NEXT: s_setpc_b64 
s[30:31] ; @@ -5865,10 +5865,10 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB24_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -5901,13 +5901,13 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB24_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -5940,13 +5940,13 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB24_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 
.LBB24_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr addrspace(3) %ptr, <2 x bfloat> %val seq_cst @@ -5995,11 +5995,11 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB25_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -6036,10 +6036,10 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB25_1 +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -6081,12 +6081,12 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB25_1 +; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -6122,10 +6122,10 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB25_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -6161,10 +6161,10 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB25_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6200,10 +6200,10 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB25_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: 
s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -6241,10 +6241,10 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB25_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -6277,13 +6277,13 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB25_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -6317,13 +6317,13 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v4, v0, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v0 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX6-NEXT: 
s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB25_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fmin ptr addrspace(3) %gep, <2 x bfloat> %val seq_cst @@ -6371,11 +6371,11 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB26_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_v2bf16: @@ -6410,11 +6410,11 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB26_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmin_noret_v2bf16: @@ -6454,12 +6454,12 @@ define void @local_atomic_fmin_noret_v2bf16(ptr 
addrspace(3) %ptr, <2 x bfloat> ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmin_noret_v2bf16: @@ -6494,10 +6494,10 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB26_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmin_noret_v2bf16: @@ -6531,11 +6531,11 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: 
local_atomic_fmin_noret_v2bf16: @@ -6569,11 +6569,11 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB26_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmin_noret_v2bf16: @@ -6609,85 +6609,85 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB26_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmin_noret_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v3, v0 +; GFX7-NEXT: ds_read_b32 v4, v0 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: 
Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 ; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v3, v4 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB26_1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmin_noret_v2bf16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: ds_read_b32 v4, v0 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, 
v4 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_min_f32_e32 v5, v5, v2 ; GFX6-NEXT: v_min_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX6-NEXT: v_alignbit_b32 v3, v3, v4, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 ; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v3, v4 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB26_1 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr addrspace(3) %ptr, <2 x bfloat> %val seq_cst ret void @@ -6734,11 +6734,11 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB27_1 
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_v2bf16__ofset: @@ -6773,11 +6773,11 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB27_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmin_noret_v2bf16__ofset: @@ -6817,12 +6817,12 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmin_noret_v2bf16__ofset: @@ -6857,10 +6857,10 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX10-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB27_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmin_noret_v2bf16__ofset: @@ -6894,11 +6894,11 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmin_noret_v2bf16__ofset: @@ -6932,11 +6932,11 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB27_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmin_noret_v2bf16__ofset: @@ -6972,48 +6972,48 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: 
v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB27_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmin_noret_v2bf16__ofset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX7-NEXT: ds_read_b32 v4, v0 offset:65532 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 ; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v3, v4 offset:65532 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB27_1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmin_noret_v2bf16__ofset: @@ -7021,37 +7021,37 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: ds_read_b32 v4, v0 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_min_f32_e32 v5, v5, v2 ; GFX6-NEXT: v_min_f32_e32 v6, v6, v1 -; GFX6-NEXT: 
v_alignbit_b32 v3, v4, v3, 16 +; GFX6-NEXT: v_alignbit_b32 v3, v3, v4, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 ; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v3, v4 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB27_1 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fmin ptr addrspace(3) %gep, <2 x bfloat> %val seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll index 5ebeddd04b2ae8..a437dce0a02c27 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll @@ -36,11 +36,11 @@ define float @local_atomic_fsub_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB0_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; 
GFX12-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -58,10 +58,10 @@ define float @local_atomic_fsub_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB0_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -82,11 +82,11 @@ define float @local_atomic_fsub_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB0_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -106,10 +106,10 @@ define float @local_atomic_fsub_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB0_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -127,10 +127,10 @@ 
define float @local_atomic_fsub_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -148,10 +148,10 @@ define float @local_atomic_fsub_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB0_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v1 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -170,10 +170,10 @@ define float @local_atomic_fsub_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB0_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -192,10 +192,10 @@ define float @local_atomic_fsub_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: 
s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB0_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -214,10 +214,10 @@ define float @local_atomic_fsub_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB0_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub ptr addrspace(3) %ptr, float 4.0 seq_cst @@ -247,11 +247,11 @@ define float @local_atomic_fsub_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB1_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -269,10 +269,10 @@ define float @local_atomic_fsub_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 
exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB1_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -293,11 +293,11 @@ define float @local_atomic_fsub_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB1_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -317,10 +317,10 @@ define float @local_atomic_fsub_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB1_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -338,10 +338,10 @@ define float @local_atomic_fsub_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 +; 
GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -359,10 +359,10 @@ define float @local_atomic_fsub_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB1_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v1 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -381,10 +381,10 @@ define float @local_atomic_fsub_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB1_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -403,10 +403,10 @@ define float @local_atomic_fsub_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB1_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, 
exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -426,10 +426,10 @@ define float @local_atomic_fsub_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB1_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fsub ptr addrspace(3) %gep, float 4.0 seq_cst @@ -458,11 +458,11 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB2_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_f32: @@ -478,11 +478,11 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB2_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB2_1 ; 
GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fsub_noret_f32: @@ -501,11 +501,11 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB2_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fsub_noret_f32: @@ -524,10 +524,10 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB2_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fsub_noret_f32: @@ -543,11 +543,11 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB2_1 ; 
GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fsub_noret_f32: @@ -563,11 +563,11 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB2_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fsub_noret_f32: @@ -584,11 +584,11 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB2_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fsub_noret_f32: @@ -605,11 +605,11 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB2_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; 
GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fsub_noret_f32: @@ -626,11 +626,11 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB2_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub ptr addrspace(3) %ptr, float 4.0 seq_cst ret void @@ -658,11 +658,11 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB3_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_f32__offset: @@ -678,11 +678,11 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB3_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX940-NEXT: ; 
%bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fsub_noret_f32__offset: @@ -701,11 +701,11 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB3_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fsub_noret_f32__offset: @@ -724,10 +724,10 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB3_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fsub_noret_f32__offset: @@ -743,11 +743,11 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: 
s_cbranch_scc1 .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fsub_noret_f32__offset: @@ -763,11 +763,11 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB3_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fsub_noret_f32__offset: @@ -784,11 +784,11 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB3_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fsub_noret_f32__offset: @@ -805,11 +805,11 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB3_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX7-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fsub_noret_f32__offset: @@ -827,11 +827,11 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB3_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(3) %ptr, i32 16383 %unused = atomicrmw fsub ptr addrspace(3) %gep, float 4.0 seq_cst @@ -866,11 +866,11 @@ define double @local_atomic_fsub_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB4_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_f64: @@ -888,10 +888,10 @@ define double @local_atomic_fsub_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB4_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: 
s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fsub_ret_f64: @@ -912,11 +912,11 @@ define double @local_atomic_fsub_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB4_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fsub_ret_f64: @@ -937,10 +937,10 @@ define double @local_atomic_fsub_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB4_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fsub_ret_f64: @@ -958,10 +958,10 @@ define double @local_atomic_fsub_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; 
GFX90A-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fsub_ret_f64: @@ -980,10 +980,10 @@ define double @local_atomic_fsub_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB4_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fsub_ret_f64: @@ -1003,10 +1003,10 @@ define double @local_atomic_fsub_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB4_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fsub_ret_f64: @@ -1026,10 +1026,10 @@ define double @local_atomic_fsub_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB4_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: 
local_atomic_fsub_ret_f64: @@ -1049,10 +1049,10 @@ define double @local_atomic_fsub_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB4_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub ptr addrspace(3) %ptr, double 4.0 seq_cst ret double %result @@ -1082,11 +1082,11 @@ define double @local_atomic_fsub_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB5_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_f64__offset: @@ -1104,10 +1104,10 @@ define double @local_atomic_fsub_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB5_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; 
GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fsub_ret_f64__offset: @@ -1128,11 +1128,11 @@ define double @local_atomic_fsub_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB5_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fsub_ret_f64__offset: @@ -1153,10 +1153,10 @@ define double @local_atomic_fsub_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB5_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fsub_ret_f64__offset: @@ -1174,10 +1174,10 @@ define double @local_atomic_fsub_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, 
s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fsub_ret_f64__offset: @@ -1196,10 +1196,10 @@ define double @local_atomic_fsub_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB5_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fsub_ret_f64__offset: @@ -1219,10 +1219,10 @@ define double @local_atomic_fsub_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB5_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fsub_ret_f64__offset: @@ -1242,10 +1242,10 @@ define double @local_atomic_fsub_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB5_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fsub_ret_f64__offset: @@ -1265,10 +1265,10 @@ define double 
@local_atomic_fsub_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB5_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr addrspace(3) %ptr, i32 8191 %result = atomicrmw fsub ptr addrspace(3) %gep, double 4.0 seq_cst @@ -1297,11 +1297,11 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] ; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB6_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_f64: @@ -1317,11 +1317,11 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB6_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 
exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fsub_noret_f64: @@ -1340,11 +1340,11 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] ; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB6_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fsub_noret_f64: @@ -1364,10 +1364,10 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_mov_b32_e32 v1, v3 ; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB6_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fsub_noret_f64: @@ -1383,11 +1383,11 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB6_1 ; 
GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fsub_noret_f64: @@ -1404,11 +1404,11 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] ; GFX908-NEXT: v_mov_b32_e32 v1, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v2, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB6_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fsub_noret_f64: @@ -1426,11 +1426,11 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] ; GFX8-NEXT: v_mov_b32_e32 v1, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB6_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fsub_noret_f64: @@ -1448,11 +1448,11 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v2, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB6_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 
exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fsub_noret_f64: @@ -1470,11 +1470,11 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] ; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v2, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB6_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub ptr addrspace(3) %ptr, double 4.0 seq_cst ret void @@ -1502,11 +1502,11 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] ; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB7_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_f64__offset: @@ -1522,11 +1522,11 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB7_1 +; GFX940-NEXT: 
s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fsub_noret_f64__offset: @@ -1545,11 +1545,11 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] ; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB7_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fsub_noret_f64__offset: @@ -1569,10 +1569,10 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX10-NEXT: v_mov_b32_e32 v1, v3 ; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB7_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fsub_noret_f64__offset: @@ -1588,11 +1588,11 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1] -; 
GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fsub_noret_f64__offset: @@ -1609,11 +1609,11 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] ; GFX908-NEXT: v_mov_b32_e32 v1, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v2, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB7_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fsub_noret_f64__offset: @@ -1631,11 +1631,11 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] ; GFX8-NEXT: v_mov_b32_e32 v1, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB7_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fsub_noret_f64__offset: @@ -1653,11 +1653,11 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v2, v4 -; 
GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB7_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fsub_noret_f64__offset: @@ -1676,11 +1676,11 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[0:1] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB7_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr addrspace(3) %ptr, i32 8191 %unused = atomicrmw fsub ptr addrspace(3) %gep, double 4.0 seq_cst @@ -1702,66 +1702,66 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: ds_load_b32 v2, v1 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-NEXT: ds_load_b32 v3, v1 +; GFX12-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX12-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_not_b32_e32 v3, v3 +; GFX12-NEXT: v_not_b32_e32 v2, v2 ; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v2 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v0, v4 -; GFX12-NEXT: v_add_f16_e32 v2, -4.0, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v0, v4 
+; GFX12-NEXT: v_add_f16_e32 v3, -4.0, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v0, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v4, v3, v2 +; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v1, v2, v4 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB8_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_f16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX940-NEXT: ds_read_b32 v2, v1 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX940-NEXT: ds_read_b32 v3, v1 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_and_b32_e32 v0, 24, v3 -; GFX940-NEXT: v_lshlrev_b32_e64 v3, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v3, v3 +; GFX940-NEXT: v_and_b32_e32 v0, 24, v2 +; GFX940-NEXT: v_lshlrev_b32_e64 v2, v2, s0 +; GFX940-NEXT: v_not_b32_e32 v2, v2 ; GFX940-NEXT: 
s_mov_b64 s[0:1], 0 ; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, v2 -; GFX940-NEXT: v_lshrrev_b32_e32 v2, v0, v4 -; GFX940-NEXT: v_add_f16_e32 v2, -4.0, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, v0, v2 -; GFX940-NEXT: v_and_or_b32 v2, v4, v3, v2 -; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX940-NEXT: v_add_f16_e32 v3, -4.0, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX940-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB8_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v0, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fsub_ret_f16: @@ -1770,35 +1770,35 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: ds_load_b32 v2, v1 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-NEXT: ds_load_b32 v3, v1 +; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX11-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_not_b32_e32 v3, v3 +; GFX11-NEXT: v_not_b32_e32 v2, v2 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: 
v_mov_b32_e32 v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v0, v4 -; GFX11-NEXT: v_add_f16_e32 v2, -4.0, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-NEXT: v_add_f16_e32 v3, -4.0, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v0, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v2, v4, v3, v2 +; GFX11-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v1, v2, v4 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fsub_ret_f16: @@ -1825,10 +1825,10 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB8_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; 
GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1836,60 +1836,60 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX90A-NEXT: ds_read_b32 v2, v1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX90A-NEXT: ds_read_b32 v3, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_and_b32_e32 v0, 24, v3 -; GFX90A-NEXT: v_lshlrev_b32_e64 v3, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, 24, v2 +; GFX90A-NEXT: v_lshlrev_b32_e64 v2, v2, s4 +; GFX90A-NEXT: v_not_b32_e32 v2, v2 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v0, v4 -; GFX90A-NEXT: v_add_f16_e32 v2, -4.0, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v0, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v4, v3, v2 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX90A-NEXT: v_add_f16_e32 v3, -4.0, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX90A-NEXT: ; 
%bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fsub_ret_f16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX908-NEXT: ds_read_b32 v2, v1 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX908-NEXT: ds_read_b32 v3, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_and_b32_e32 v0, 24, v3 -; GFX908-NEXT: v_lshlrev_b32_e64 v3, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v3, v3 +; GFX908-NEXT: v_and_b32_e32 v0, 24, v2 +; GFX908-NEXT: v_lshlrev_b32_e64 v2, v2, s4 +; GFX908-NEXT: v_not_b32_e32 v2, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_lshrrev_b32_e32 v2, v0, v4 -; GFX908-NEXT: v_add_f16_e32 v2, -4.0, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, v0, v2 -; GFX908-NEXT: v_and_or_b32 v2, v4, v3, v2 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX908-NEXT: v_add_f16_e32 v3, -4.0, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB8_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 
v0, v0, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fsub_ret_f16: @@ -1917,10 +1917,10 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB8_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1950,10 +1950,10 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -1984,10 +1984,10 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB8_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -2032,43 
+2032,43 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB9_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_f16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v0, 0xfffe, v0 -; GFX940-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX940-NEXT: ds_read_b32 v2, v1 -; GFX940-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX940-NEXT: v_add_u32_e32 v1, 0xfffe, v0 +; GFX940-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX940-NEXT: ds_read_b32 v3, v0 +; GFX940-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v3, v0, s0 -; GFX940-NEXT: v_not_b32_e32 v3, v3 +; GFX940-NEXT: v_lshlrev_b32_e64 v2, v1, s0 +; GFX940-NEXT: v_not_b32_e32 v2, v2 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, v2 -; GFX940-NEXT: v_lshrrev_b32_e32 v2, v0, v4 -; GFX940-NEXT: v_add_f16_e32 v2, -4.0, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, v0, v2 -; GFX940-NEXT: v_and_or_b32 v2, v4, v3, v2 -; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; 
GFX940-NEXT: v_add_f16_e32 v3, -4.0, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX940-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB9_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v0, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fsub_ret_f16__offset: @@ -2102,11 +2102,11 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2117,124 +2117,124 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 3, v1 -; GFX10-NEXT: ds_read_b32 v2, v0 +; GFX10-NEXT: ds_read_b32 v3, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, 0xffff -; GFX10-NEXT: v_not_b32_e32 v3, v3 +; 
GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX10-NEXT: v_not_b32_e32 v2, v2 ; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v1, v4 -; GFX10-NEXT: v_add_f16_e32 v2, -4.0, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v2, v4, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX10-NEXT: v_add_f16_e32 v3, -4.0, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB9_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fsub_ret_f16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v0, 0xfffe, v0 -; GFX90A-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX90A-NEXT: ds_read_b32 v2, v1 -; GFX90A-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX90A-NEXT: v_add_u32_e32 v1, 0xfffe, v0 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX90A-NEXT: ds_read_b32 v3, v0 +; GFX90A-NEXT: v_and_b32_e32 
v1, 3, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v3, v0, s4 -; GFX90A-NEXT: v_not_b32_e32 v3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e64 v2, v1, s4 +; GFX90A-NEXT: v_not_b32_e32 v2, v2 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v0, v4 -; GFX90A-NEXT: v_add_f16_e32 v2, -4.0, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v0, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v4, v3, v2 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX90A-NEXT: v_add_f16_e32 v3, -4.0, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fsub_ret_f16__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v0, 0xfffe, v0 -; GFX908-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX908-NEXT: ds_read_b32 v2, v1 -; GFX908-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX908-NEXT: v_add_u32_e32 v1, 0xfffe, v0 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX908-NEXT: ds_read_b32 
v3, v0 +; GFX908-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v3, v0, s4 -; GFX908-NEXT: v_not_b32_e32 v3, v3 +; GFX908-NEXT: v_lshlrev_b32_e64 v2, v1, s4 +; GFX908-NEXT: v_not_b32_e32 v2, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_lshrrev_b32_e32 v2, v0, v4 -; GFX908-NEXT: v_add_f16_e32 v2, -4.0, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, v0, v2 -; GFX908-NEXT: v_and_or_b32 v2, v4, v3, v2 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX908-NEXT: v_add_f16_e32 v3, -4.0, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB9_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v0, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fsub_ret_f16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffe, v0 -; GFX8-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xfffe, v0 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v1 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v3, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX8-NEXT: 
v_lshlrev_b32_e32 v0, 3, v0 +; GFX8-NEXT: ds_read_b32 v3, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4 +; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s4 ; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v1, v4 ; GFX8-NEXT: v_add_f16_e32 v3, -4.0, v3 ; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v1, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB9_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fsub_ret_f16__offset: @@ -2264,10 +2264,10 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB9_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: 
v_lshrrev_b32_e32 v0, v1, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -2299,10 +2299,10 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB9_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -2346,11 +2346,11 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB10_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_f16: @@ -2375,11 +2375,11 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB10_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX940-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fsub_noret_f16: @@ -2411,11 +2411,11 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX11-NEXT: v_mov_b32_e32 v2, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB10_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fsub_noret_f16: @@ -2442,10 +2442,10 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB10_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fsub_noret_f16: @@ -2470,11 +2470,11 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX90A-NEXT: ; %bb.2: 
; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fsub_noret_f16: @@ -2499,11 +2499,11 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB10_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fsub_noret_f16: @@ -2530,11 +2530,11 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB10_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fsub_noret_f16: @@ -2562,11 +2562,11 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB10_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fsub_noret_f16: @@ -2594,11 +2594,11 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB10_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub ptr addrspace(3) %ptr, half 4.0 seq_cst ret void @@ -2640,11 +2640,11 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB11_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_f16__offset: @@ -2670,11 +2670,11 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB11_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX940-NEXT: ; 
%bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fsub_noret_f16__offset: @@ -2707,11 +2707,11 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB11_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fsub_noret_f16__offset: @@ -2739,10 +2739,10 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB11_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fsub_noret_f16__offset: @@ -2768,11 +2768,11 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; 
GFX90A-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fsub_noret_f16__offset: @@ -2798,11 +2798,11 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB11_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fsub_noret_f16__offset: @@ -2830,11 +2830,11 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB11_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fsub_noret_f16__offset: @@ -2863,11 +2863,11 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB11_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 
.LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fsub_noret_f16__offset: @@ -2896,11 +2896,11 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB11_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(3) %ptr, i32 32767 %unused = atomicrmw fsub ptr addrspace(3) %gep, half 4.0 seq_cst @@ -2933,11 +2933,11 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB12_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2957,10 +2957,10 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB12_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, 
s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -2984,11 +2984,11 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB12_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3010,10 +3010,10 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB12_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -3033,10 +3033,10 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], 
s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3056,10 +3056,10 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB12_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v1 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -3080,10 +3080,10 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB12_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -3106,10 +3106,10 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB12_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: 
s_setpc_b64 s[30:31] ; @@ -3133,10 +3133,10 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB12_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(3) %ptr, i32 32767 @@ -3169,11 +3169,11 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB13_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_f16__offset__align4: @@ -3191,11 +3191,11 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB13_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: 
s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fsub_noret_f16__offset__align4: @@ -3217,11 +3217,11 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB13_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fsub_noret_f16__offset__align4: @@ -3242,10 +3242,10 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB13_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fsub_noret_f16__offset__align4: @@ -3263,11 +3263,11 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 
.LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fsub_noret_f16__offset__align4: @@ -3285,11 +3285,11 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB13_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fsub_noret_f16__offset__align4: @@ -3308,11 +3308,11 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB13_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fsub_noret_f16__offset__align4: @@ -3333,11 +3333,11 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB13_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB13_1 ; 
GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fsub_noret_f16__offset__align4: @@ -3359,11 +3359,11 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB13_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(3) %ptr, i32 32767 %unused = atomicrmw fsub ptr addrspace(3) %gep, half 4.0 seq_cst, align 4 @@ -3417,11 +3417,11 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB14_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -3456,10 +3456,10 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB14_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], 
exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -3501,11 +3501,11 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3538,10 +3538,10 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB14_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -3574,10 +3574,10 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 
exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3610,10 +3610,10 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB14_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -3648,10 +3648,10 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB14_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -3681,10 +3681,10 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB14_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 
v0, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -3715,10 +3715,10 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB14_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -3771,24 +3771,24 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB15_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_bf16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v0, 0xfffe, v0 -; GFX940-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX940-NEXT: ds_read_b32 v3, v1 -; GFX940-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX940-NEXT: v_add_u32_e32 v1, 0xfffe, v0 +; GFX940-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX940-NEXT: ds_read_b32 v3, v0 +; GFX940-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX940-NEXT: v_lshlrev_b32_e32 
v1, 3, v1 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v2, v0, s0 +; GFX940-NEXT: v_lshlrev_b32_e64 v2, v1, s0 ; GFX940-NEXT: v_not_b32_e32 v2, v2 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff @@ -3796,7 +3796,7 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v4, v3 -; GFX940-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_add_f32_e32 v3, -4.0, v3 ; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1 @@ -3805,17 +3805,17 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX940-NEXT: s_nop 1 ; GFX940-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX940-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB15_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
local_atomic_fsub_ret_bf16__offset: @@ -3858,11 +3858,11 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3896,23 +3896,23 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB15_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fsub_ret_bf16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v0, 0xfffe, v0 -; GFX90A-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX90A-NEXT: ds_read_b32 v3, v1 -; GFX90A-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX90A-NEXT: v_add_u32_e32 v1, 0xfffe, v0 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX90A-NEXT: ds_read_b32 v3, v0 +; GFX90A-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v2, v0, s4 +; 
GFX90A-NEXT: v_lshlrev_b32_e64 v2, v1, s4 ; GFX90A-NEXT: v_not_b32_e32 v2, v2 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff @@ -3920,36 +3920,36 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX90A-NEXT: v_add_f32_e32 v3, -4.0, v3 ; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fsub_ret_bf16__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v0, 0xfffe, v0 -; GFX908-NEXT: v_and_b32_e32 v1, 
-4, v0 -; GFX908-NEXT: ds_read_b32 v3, v1 -; GFX908-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX908-NEXT: v_add_u32_e32 v1, 0xfffe, v0 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX908-NEXT: ds_read_b32 v3, v0 +; GFX908-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v2, v0, s4 +; GFX908-NEXT: v_lshlrev_b32_e64 v2, v1, s4 ; GFX908-NEXT: v_not_b32_e32 v2, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff @@ -3957,44 +3957,44 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX908-NEXT: v_add_f32_e32 v3, -4.0, v3 ; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB15_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB15_1 ; 
GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fsub_ret_bf16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffe, v0 -; GFX8-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xfffe, v0 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v1 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v3, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX8-NEXT: ds_read_b32 v3, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4 +; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s4 ; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_add_f32_e32 v3, -4.0, v3 ; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 @@ -4003,17 +4003,17 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc ; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX8-NEXT: 
ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB15_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fsub_ret_bf16__offset: @@ -4043,10 +4043,10 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB15_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -4078,10 +4078,10 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB15_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v1, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -4133,11 +4133,11 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) 
nounwind { ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_bf16: @@ -4170,11 +4170,11 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB16_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fsub_noret_bf16: @@ -4215,11 +4215,11 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX11-NEXT: v_mov_b32_e32 v2, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB16_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fsub_noret_bf16: @@ -4251,10 +4251,10 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB16_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fsub_noret_bf16: @@ -4285,11 +4285,11 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fsub_noret_bf16: @@ -4320,11 +4320,11 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB16_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fsub_noret_bf16: @@ -4357,11 
+4357,11 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB16_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fsub_noret_bf16: @@ -4389,11 +4389,11 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB16_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fsub_noret_bf16: @@ -4421,11 +4421,11 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB16_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub ptr addrspace(3) %ptr, bfloat 4.0 seq_cst ret void @@ -4475,11 +4475,11 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr 
addrspace(3) %ptr) nounwin ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_bf16__offset: @@ -4513,11 +4513,11 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB17_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fsub_noret_bf16__offset: @@ -4559,11 +4559,11 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: 
s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fsub_noret_bf16__offset: @@ -4596,10 +4596,10 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB17_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fsub_noret_bf16__offset: @@ -4631,11 +4631,11 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fsub_noret_bf16__offset: @@ -4667,11 +4667,11 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB17_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; 
GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fsub_noret_bf16__offset: @@ -4705,11 +4705,11 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB17_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fsub_noret_bf16__offset: @@ -4738,11 +4738,11 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fsub_noret_bf16__offset: @@ -4771,11 +4771,11 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB17_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, 
ptr addrspace(3) %ptr, i32 32767 %unused = atomicrmw fsub ptr addrspace(3) %gep, bfloat 4.0 seq_cst @@ -4816,11 +4816,11 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -4849,10 +4849,10 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB18_1 +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -4885,11 +4885,11 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, 
s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4917,10 +4917,10 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB18_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4948,10 +4948,10 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4979,10 +4979,10 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v1 ; 
GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -5010,10 +5010,10 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB18_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -5036,10 +5036,10 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB18_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -5063,10 +5063,10 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB18_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(3) %ptr, i32 32767 @@ -5107,11 +5107,11 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; 
GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_bf16__offset__align4: @@ -5138,11 +5138,11 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB19_1 +; GFX940-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fsub_noret_bf16__offset__align4: @@ -5173,11 +5173,11 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fsub_noret_bf16__offset__align4: @@ -5204,10 +5204,10 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB19_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fsub_noret_bf16__offset__align4: @@ -5233,11 +5233,11 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB19_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fsub_noret_bf16__offset__align4: @@ -5263,11 +5263,11 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB19_1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; 
GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fsub_noret_bf16__offset__align4: @@ -5293,11 +5293,11 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB19_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fsub_noret_bf16__offset__align4: @@ -5318,11 +5318,11 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB19_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fsub_noret_bf16__offset__align4: @@ -5344,11 +5344,11 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB19_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = 
getelementptr bfloat, ptr addrspace(3) %ptr, i32 32767 %unused = atomicrmw fsub ptr addrspace(3) %gep, bfloat 4.0 seq_cst, align 4 @@ -5382,11 +5382,11 @@ define <2 x half> @local_atomic_fsub_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB20_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -5404,10 +5404,10 @@ define <2 x half> @local_atomic_fsub_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB20_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -5428,11 +5428,11 @@ define <2 x half> @local_atomic_fsub_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB20_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; 
GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -5452,10 +5452,10 @@ define <2 x half> @local_atomic_fsub_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB20_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -5473,10 +5473,10 @@ define <2 x half> @local_atomic_fsub_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5494,10 +5494,10 @@ define <2 x half> @local_atomic_fsub_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB20_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; 
GFX908-NEXT: v_mov_b32_e32 v0, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -5518,10 +5518,10 @@ define <2 x half> @local_atomic_fsub_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB20_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -5546,24 +5546,24 @@ define <2 x half> @local_atomic_fsub_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX7-NEXT: v_sub_f32_e32 v5, v5, v3 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB20_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -5588,24 +5588,24 @@ define <2 x half> @local_atomic_fsub_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_sub_f32_e32 v5, v5, v3 ; GFX6-NEXT: v_sub_f32_e32 v6, v6, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB20_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383 @@ -5636,11 +5636,11 @@ define <2 x half> @local_atomic_fsub_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB21_1 +; GFX12-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -5658,10 +5658,10 @@ define <2 x half> @local_atomic_fsub_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB21_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -5682,11 +5682,11 @@ define <2 x half> @local_atomic_fsub_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB21_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -5706,10 +5706,10 @@ define <2 x half> @local_atomic_fsub_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: 
s_cbranch_execnz .LBB21_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -5727,10 +5727,10 @@ define <2 x half> @local_atomic_fsub_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5748,10 +5748,10 @@ define <2 x half> @local_atomic_fsub_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB21_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -5772,10 +5772,10 @@ define <2 x half> @local_atomic_fsub_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB21_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX8-NEXT: 
; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -5800,24 +5800,24 @@ define <2 x half> @local_atomic_fsub_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX7-NEXT: v_sub_f32_e32 v5, v5, v3 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v1 offset:65532 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB21_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -5843,24 +5843,24 @@ define <2 x half> @local_atomic_fsub_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_sub_f32_e32 v5, v5, v2 ; GFX6-NEXT: v_sub_f32_e32 v6, v6, v4 ; GFX6-NEXT: 
v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v0, v6, v0 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v3, v7, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v1, v6, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v3, v0, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB21_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fsub ptr addrspace(3) %gep, <2 x half> %val seq_cst @@ -5889,11 +5889,11 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB22_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_v2f16: @@ -5909,11 +5909,11 @@ define void 
@local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB22_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fsub_noret_v2f16: @@ -5932,11 +5932,11 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX11-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB22_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fsub_noret_v2f16: @@ -5955,10 +5955,10 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB22_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fsub_noret_v2f16: @@ 
-5974,11 +5974,11 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fsub_noret_v2f16: @@ -5994,11 +5994,11 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v2, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB22_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fsub_noret_v2f16: @@ -6017,11 +6017,11 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB22_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fsub_noret_v2f16: @@ -6045,24 +6045,24 @@ define void 
@local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: v_sub_f32_e32 v5, v5, v1 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v4 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB22_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fsub_noret_v2f16: @@ -6086,24 +6086,24 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: v_sub_f32_e32 v5, v5, v1 ; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: 
ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v4 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB22_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub ptr addrspace(3) %ptr, <2 x half> %val seq_cst ret void @@ -6131,11 +6131,11 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB23_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_v2f16__offset: @@ -6151,11 +6151,11 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 
s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB23_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fsub_noret_v2f16__offset: @@ -6174,11 +6174,11 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX11-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB23_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fsub_noret_v2f16__offset: @@ -6197,10 +6197,10 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB23_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fsub_noret_v2f16__offset: @@ -6216,11 +6216,11 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX90A-NEXT: 
s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fsub_noret_v2f16__offset: @@ -6236,11 +6236,11 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v2, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB23_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fsub_noret_v2f16__offset: @@ -6259,11 +6259,11 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB23_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fsub_noret_v2f16__offset: @@ -6287,24 +6287,24 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 
v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: v_sub_f32_e32 v5, v5, v1 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 offset:65532 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v4 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB23_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fsub_noret_v2f16__offset: @@ -6329,24 +6329,24 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: v_sub_f32_e32 v5, v5, v1 ; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v4 +; GFX6-NEXT: 
ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB23_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fsub ptr addrspace(3) %gep, <2 x half> %val seq_cst @@ -6399,11 +6399,11 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB24_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -6440,10 +6440,10 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB24_1 +; GFX940-NEXT: 
s_andn2_b64 s[0:1], exec, s[2:3] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -6485,12 +6485,12 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB24_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -6526,10 +6526,10 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB24_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -6565,10 +6565,10 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 +; GFX90A-NEXT: 
s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6604,10 +6604,10 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB24_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -6645,10 +6645,10 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB24_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -6681,13 +6681,13 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; 
GFX7-NEXT: s_cbranch_execnz .LBB24_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -6720,13 +6720,13 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB24_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub ptr addrspace(3) %ptr, <2 x bfloat> %val seq_cst @@ -6775,11 +6775,11 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB25_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -6816,10 +6816,10 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX940-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB25_1 +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -6861,12 +6861,12 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB25_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -6902,10 +6902,10 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB25_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -6941,10 +6941,10 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX90A-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB25_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6980,10 +6980,10 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB25_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -7021,10 +7021,10 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB25_1 +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -7057,13 +7057,13 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX7-NEXT: 
s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB25_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -7097,13 +7097,13 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v4, v0, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v0 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB25_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fsub ptr addrspace(3) %gep, <2 x bfloat> %val seq_cst @@ -7151,11 +7151,11 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB26_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_v2bf16: @@ -7190,11 +7190,11 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB26_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fsub_noret_v2bf16: @@ -7234,12 +7234,12 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fsub_noret_v2bf16: @@ -7274,10 +7274,10 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB26_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; 
GFX10-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fsub_noret_v2bf16: @@ -7311,11 +7311,11 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fsub_noret_v2bf16: @@ -7349,11 +7349,11 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB26_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fsub_noret_v2bf16: @@ -7389,11 +7389,11 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB26_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 
.LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fsub_noret_v2bf16: @@ -7424,13 +7424,13 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB26_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fsub_noret_v2bf16: @@ -7461,13 +7461,13 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB26_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub ptr addrspace(3) %ptr, <2 x bfloat> %val seq_cst ret void @@ -7514,11 +7514,11 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, 
s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB27_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_v2bf16__ofset: @@ -7553,11 +7553,11 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB27_1 +; GFX940-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX940-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fsub_noret_v2bf16__ofset: @@ -7597,12 +7597,12 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: 
local_atomic_fsub_noret_v2bf16__ofset: @@ -7637,10 +7637,10 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB27_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fsub_noret_v2bf16__ofset: @@ -7674,11 +7674,11 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fsub_noret_v2bf16__ofset: @@ -7712,11 +7712,11 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB27_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fsub_noret_v2bf16__ofset: 
@@ -7752,11 +7752,11 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB27_1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fsub_noret_v2bf16__ofset: @@ -7787,13 +7787,13 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB27_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fsub_noret_v2bf16__ofset: @@ -7825,13 +7825,13 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB27_1 +; 
GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fsub ptr addrspace(3) %gep, <2 x bfloat> %val seq_cst @@ -7865,11 +7865,11 @@ define float @local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB28_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -7887,10 +7887,10 @@ define float @local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB28_1 +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -7911,11 +7911,11 @@ define float @local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB28_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -7935,10 +7935,10 @@ define float @local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB28_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -7956,10 +7956,10 @@ define float @local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7977,10 +7977,10 @@ define float @local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB28_1 +; GFX908-NEXT: 
s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v1 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -7999,10 +7999,10 @@ define float @local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB28_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -8021,10 +8021,10 @@ define float @local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB28_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -8043,10 +8043,10 @@ define float @local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB28_1 +; GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; 
GFX6-NEXT: v_mov_b32_e32 v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub ptr addrspace(3) %ptr, float 4.0 seq_cst, !amdgpu.ignore.denormal.mode !0 @@ -8075,11 +8075,11 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB29_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode: @@ -8095,11 +8095,11 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB29_1 +; GFX940-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX940-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode: @@ -8118,11 +8118,11 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz 
.LBB29_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode: @@ -8141,10 +8141,10 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB29_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode: @@ -8160,11 +8160,11 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode: @@ -8180,11 +8180,11 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: 
s_andn2_b64 s[6:7], exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB29_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode: @@ -8201,11 +8201,11 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB29_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode: @@ -8222,11 +8222,11 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB29_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode: @@ -8243,11 +8243,11 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; 
GFX6-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB29_1 +; GFX6-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX6-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub ptr addrspace(3) %ptr, float 4.0 seq_cst, !amdgpu.ignore.denormal.mode !0 ret void diff --git a/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll b/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll index 7814eb603e5541..df97ac9316b62a 100644 --- a/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll +++ b/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll @@ -150,6 +150,7 @@ bb3: define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 { ; GCN-LABEL: min_long_forward_vbranch: ; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_mov_b64 s[4:5], exec ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 @@ -159,17 +160,18 @@ define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 { ; GCN-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GCN-NEXT: s_cbranch_execnz .LBB3_1 +; GCN-NEXT: s_and_b64 s[6:7], vcc, -1 +; GCN-NEXT: v_add_i32_e64 v0, s[0:1], s0, v0 +; GCN-NEXT: v_addc_u32_e64 v1, s[0:1], 0, v1, s[0:1] +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc1 .LBB3_1 ; GCN-NEXT: ; %bb.3: ; %bb -; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_getpc_b64 s[8:9] ; GCN-NEXT: .Lpost_getpc2: -; GCN-NEXT: s_add_u32 s4, s4, (.LBB3_2-.Lpost_getpc2)&4294967295 -; GCN-NEXT: s_addc_u32 s5, s5, (.LBB3_2-.Lpost_getpc2)>>32 -; 
GCN-NEXT: s_setpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s8, s8, (.LBB3_2-.Lpost_getpc2)&4294967295 +; GCN-NEXT: s_addc_u32 s9, s9, (.LBB3_2-.Lpost_getpc2)>>32 +; GCN-NEXT: s_setpc_b64 s[8:9] ; GCN-NEXT: .LBB3_1: ; %bb2 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; 32 bytes @@ -178,8 +180,8 @@ define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 { ; GCN-NEXT: v_nop_e64 ; GCN-NEXT: v_nop_e64 ; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: .LBB3_2: ; %bb3 -; GCN-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-NEXT: s_mov_b32 s0, s2 ; GCN-NEXT: s_mov_b32 s1, s2 ; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 diff --git a/llvm/test/CodeGen/AMDGPU/loop-live-out-copy-undef-subrange.ll b/llvm/test/CodeGen/AMDGPU/loop-live-out-copy-undef-subrange.ll index 2d3c03bbe53179..e1dae09d480c15 100644 --- a/llvm/test/CodeGen/AMDGPU/loop-live-out-copy-undef-subrange.ll +++ b/llvm/test/CodeGen/AMDGPU/loop-live-out-copy-undef-subrange.ll @@ -17,11 +17,11 @@ define <3 x float> @liveout_undef_subrange(<3 x float> %arg) { ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_cmp_neq_f32_e32 vcc, 0, v2 ; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB0_1 +; CHECK-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; CHECK-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; CHECK-NEXT: s_cbranch_scc1 .LBB0_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: v_mul_f32_e32 v2, v3, v2 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_cbranch_execnz .LBB0_1 diff --git a/llvm/test/CodeGen/AMDGPU/loop-on-function-argument.ll b/llvm/test/CodeGen/AMDGPU/loop-on-function-argument.ll index 546022b4f9c43d..e5770d262c4564 100644 --- a/llvm/test/CodeGen/AMDGPU/loop-on-function-argument.ll +++ b/llvm/test/CodeGen/AMDGPU/loop-on-function-argument.ll @@ -27,12 +27,12 @@ define void @loop_on_argument(i1 %arg) { ; 
CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_and_b64 s[6:7], exec, vcc ; CHECK-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] +; CHECK-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; CHECK-NEXT: global_store_dword v[0:1], v0, off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB0_1 +; CHECK-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; CHECK-NEXT: s_cbranch_scc1 .LBB0_1 ; CHECK-NEXT: ; %bb.2: ; %exit -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: br label %loop diff --git a/llvm/test/CodeGen/AMDGPU/loop_break.ll b/llvm/test/CodeGen/AMDGPU/loop_break.ll index 5484ba1ed2fe08..5b5440ab543874 100644 --- a/llvm/test/CodeGen/AMDGPU/loop_break.ll +++ b/llvm/test/CodeGen/AMDGPU/loop_break.ll @@ -61,8 +61,9 @@ define amdgpu_kernel void @break_loop(i32 %arg) #0 { ; GCN-NEXT: ; in Loop: Header=BB0_1 Depth=1 ; GCN-NEXT: s_and_b64 s[8:9], exec, s[4:5] ; GCN-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] -; GCN-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN-NEXT: s_cbranch_execnz .LBB0_1 +; GCN-NEXT: s_andn2_b64 s[8:9], exec, s[0:1] +; GCN-NEXT: s_cselect_b64 exec, s[8:9], s[0:1] +; GCN-NEXT: s_cbranch_scc1 .LBB0_1 ; GCN-NEXT: ; %bb.5: ; %bb9 ; GCN-NEXT: s_endpgm bb: @@ -140,10 +141,10 @@ define amdgpu_kernel void @undef_phi_cond_break_loop(i32 %arg) #0 { ; GCN-NEXT: s_add_i32 s6, s6, 1 ; GCN-NEXT: s_and_b64 s[8:9], exec, s[4:5] ; GCN-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] -; GCN-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN-NEXT: s_cbranch_execnz .LBB1_1 +; GCN-NEXT: s_andn2_b64 s[8:9], exec, s[0:1] +; GCN-NEXT: s_cselect_b64 exec, s[8:9], s[0:1] +; GCN-NEXT: s_cbranch_scc1 .LBB1_1 ; GCN-NEXT: ; %bb.4: ; %bb9 -; GCN-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v0, 7 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_write_b32 v0, v0 @@ -233,10 +234,10 @@ define amdgpu_kernel void @constexpr_phi_cond_break_loop(i32 %arg) #0 { ; GCN-NEXT: s_add_i32 s6, s6, 1 ; GCN-NEXT: s_and_b64 
s[8:9], exec, s[4:5] ; GCN-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] -; GCN-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN-NEXT: s_cbranch_execnz .LBB2_1 +; GCN-NEXT: s_andn2_b64 s[8:9], exec, s[0:1] +; GCN-NEXT: s_cselect_b64 exec, s[8:9], s[0:1] +; GCN-NEXT: s_cbranch_scc1 .LBB2_1 ; GCN-NEXT: ; %bb.4: ; %bb9 -; GCN-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v0, 7 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_write_b32 v0, v0 @@ -323,10 +324,10 @@ define amdgpu_kernel void @true_phi_cond_break_loop(i32 %arg) #0 { ; GCN-NEXT: s_add_i32 s6, s6, 1 ; GCN-NEXT: s_and_b64 s[8:9], exec, s[4:5] ; GCN-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] -; GCN-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN-NEXT: s_cbranch_execnz .LBB3_1 +; GCN-NEXT: s_andn2_b64 s[8:9], exec, s[0:1] +; GCN-NEXT: s_cselect_b64 exec, s[8:9], s[0:1] +; GCN-NEXT: s_cbranch_scc1 .LBB3_1 ; GCN-NEXT: ; %bb.4: ; %bb9 -; GCN-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v0, 7 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_write_b32 v0, v0 @@ -412,10 +413,10 @@ define amdgpu_kernel void @false_phi_cond_break_loop(i32 %arg) #0 { ; GCN-NEXT: s_add_i32 s6, s6, 1 ; GCN-NEXT: s_and_b64 s[8:9], exec, s[4:5] ; GCN-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] -; GCN-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN-NEXT: s_cbranch_execnz .LBB4_1 +; GCN-NEXT: s_andn2_b64 s[8:9], exec, s[0:1] +; GCN-NEXT: s_cselect_b64 exec, s[8:9], s[0:1] +; GCN-NEXT: s_cbranch_scc1 .LBB4_1 ; GCN-NEXT: ; %bb.4: ; %bb9 -; GCN-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v0, 7 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_write_b32 v0, v0 @@ -506,10 +507,10 @@ define amdgpu_kernel void @invert_true_phi_cond_break_loop(i32 %arg) #0 { ; GCN-NEXT: s_add_i32 s6, s6, 1 ; GCN-NEXT: s_and_b64 s[8:9], exec, s[8:9] ; GCN-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] -; GCN-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN-NEXT: s_cbranch_execnz .LBB5_1 +; GCN-NEXT: s_andn2_b64 s[8:9], exec, s[0:1] +; GCN-NEXT: s_cselect_b64 exec, s[8:9], s[0:1] +; 
GCN-NEXT: s_cbranch_scc1 .LBB5_1 ; GCN-NEXT: ; %bb.4: ; %bb9 -; GCN-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v0, 7 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_write_b32 v0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll b/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll index a407cd20bf7624..f2895b674c28ed 100644 --- a/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll +++ b/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll @@ -15,25 +15,26 @@ define void @needs_and(i32 %arg) { ; GCN-NEXT: s_branch .LBB0_2 ; GCN-NEXT: .LBB0_1: ; %endif ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GCN-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-NEXT: s_and_b64 s[4:5], exec, vcc ; GCN-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7] ; GCN-NEXT: s_add_i32 s10, s10, 1 -; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GCN-NEXT: s_cbranch_execz .LBB0_4 +; GCN-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GCN-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GCN-NEXT: s_cbranch_scc0 .LBB0_4 ; GCN-NEXT: .LBB0_2: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_cmp_gt_u32_e64 s[4:5], s10, v0 +; GCN-NEXT: s_mov_b64 s[8:9], exec ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s10, v0 -; GCN-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_1 +; GCN-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-NEXT: s_cbranch_scc0 .LBB0_1 ; GCN-NEXT: ; %bb.3: ; %then ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GCN-NEXT: s_nop 1 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], s4 +; GCN-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-NEXT: s_branch .LBB0_1 ; GCN-NEXT: .LBB0_4: ; %loopexit -; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] entry: @@ -71,11 +72,11 @@ define void @doesnt_need_and(i32 %arg) { ; GCN-NEXT: s_add_i32 s6, s6, 1 ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 ; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] ; GCN-NEXT: buffer_store_dword v0, off, 
s[4:7], s4 -; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB1_1 +; GCN-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GCN-NEXT: s_cbranch_scc1 .LBB1_1 ; GCN-NEXT: ; %bb.2: ; %loopexit -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] entry: @@ -107,23 +108,25 @@ define void @break_cond_is_arg(i32 %arg, i1 %breakcond) { ; GCN-NEXT: s_branch .LBB2_2 ; GCN-NEXT: .LBB2_1: ; %endif ; GCN-NEXT: ; in Loop: Header=BB2_2 Depth=1 -; GCN-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-NEXT: s_and_b64 s[8:9], exec, s[4:5] ; GCN-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] ; GCN-NEXT: s_add_i32 s10, s10, 1 -; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GCN-NEXT: s_cbranch_execz .LBB2_4 +; GCN-NEXT: s_andn2_b64 s[8:9], exec, s[6:7] +; GCN-NEXT: s_cselect_b64 exec, s[8:9], s[6:7] +; GCN-NEXT: s_cbranch_scc0 .LBB2_4 ; GCN-NEXT: .LBB2_2: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, s10, v0 -; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GCN-NEXT: s_cbranch_execz .LBB2_1 +; GCN-NEXT: s_mov_b64 s[8:9], exec +; GCN-NEXT: s_cmp_lg_u64 vcc, 0 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB2_1 ; GCN-NEXT: ; %bb.3: ; %then ; GCN-NEXT: ; in Loop: Header=BB2_2 Depth=1 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], s4 +; GCN-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-NEXT: s_branch .LBB2_1 ; GCN-NEXT: .LBB2_4: ; %loopexit -; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-intervals.mir b/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-intervals.mir index 9eeec4fa3a93d1..25b8ac0aba27c1 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-intervals.mir +++ b/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-intervals.mir @@ -15,32 +15,33 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: 
[[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[COPY]], implicit $exec - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $exec_lo, implicit-def $exec_lo - ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[V_CMP_NE_U32_e64_]], implicit-def dead $scc - ; CHECK-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[S_AND_B32_]], [[COPY1]], implicit-def dead $scc - ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_AND_B32_]] - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_NE_U32_e64_]], $exec_lo, implicit-def $scc + ; CHECK-NEXT: S_CMP_LG_U32_term [[V_CMP_NE_U32_e64_]], 0, implicit-def $scc + ; CHECK-NEXT: $exec_lo = S_CMOV_B32_term [[V_CMP_NE_U32_e64_]], implicit $scc + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; CHECK-NEXT: S_BRANCH %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, %3, implicit-def $scc ; CHECK-NEXT: S_ENDPGM 0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $exec_lo = S_OR_B32_term $exec_lo, [[S_XOR_B32_]], implicit-def $scc + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.1(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[S_OR_SAVEEXEC_B32_:%[0-9]+]]:sreg_32 = S_OR_SAVEEXEC_B32 [[S_XOR_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 $exec_lo, [[S_OR_SAVEEXEC_B32_]], implicit-def $scc - ; CHECK-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_B32_1]], implicit-def $scc - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.4 + ; CHECK-NEXT: [[S_XOR_B32_1:%[0-9]+]]:sreg_32 = S_XOR_B32 [[S_XOR_B32_]], $exec_lo, implicit-def $scc + ; CHECK-NEXT: S_CMP_LG_U32_term [[S_XOR_B32_]], 0, implicit-def $scc + ; CHECK-NEXT: $exec_lo = S_CMOV_B32_term [[S_XOR_B32_]], 
implicit $scc + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.4, implicit $scc + ; CHECK-NEXT: S_BRANCH %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $exec_lo = S_OR_B32_term $exec_lo, [[S_XOR_B32_1]], implicit-def $scc ; CHECK-NEXT: S_BRANCH %bb.1 bb.0: successors: %bb.2(0x40000000), %bb.3(0x40000000) @@ -52,11 +53,11 @@ body: | S_BRANCH %bb.2 bb.1: - SI_END_CF killed %1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0 bb.2: successors: %bb.3(0x80000000) + SI_WAVE_RECONVERGE %0, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: @@ -68,6 +69,7 @@ body: | bb.4: successors: %bb.1(0x80000000) + SI_WAVE_RECONVERGE killed %1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.1 ... @@ -94,12 +96,12 @@ body: | ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 $exec_lo, [[V_CMP_GT_I32_e64_]], implicit-def $scc ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], [[COPY2]], implicit-def $scc ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_OR_B32_]] - ; CHECK-NEXT: $exec_lo = S_ANDN2_B32_term $exec_lo, [[S_OR_B32_]], implicit-def $scc - ; CHECK-NEXT: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32 = S_ANDN2_B32 $exec_lo, [[S_OR_B32_]], implicit-def $scc + ; CHECK-NEXT: $exec_lo = S_CSELECT_B32_term [[S_ANDN2_B32_]], [[S_OR_B32_]], implicit $scc + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_OR_B32_]], implicit-def $scc ; CHECK-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1(0x80000000) @@ -120,7 +122,6 @@ body: | S_BRANCH %bb.2 bb.2: - SI_END_CF killed %2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0 ... 
@@ -137,27 +138,33 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[V_CMP_NGT_F32_e64_:%[0-9]+]]:sreg_32 = nofpexcept V_CMP_NGT_F32_e64 0, 0, 0, [[COPY]], 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $exec_lo, implicit-def $exec_lo - ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[V_CMP_NGT_F32_e64_]], implicit-def dead $scc - ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_AND_B32_]] - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $exec_lo + ; CHECK-NEXT: S_CMP_LG_U32_term [[V_CMP_NGT_F32_e64_]], 0, implicit-def $scc + ; CHECK-NEXT: $exec_lo = S_CMOV_B32_term [[V_CMP_NGT_F32_e64_]], implicit $scc + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc + ; CHECK-NEXT: S_BRANCH %bb.4 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_CMP_NLT_F32_e64_:%[0-9]+]]:sreg_32 = nofpexcept V_CMP_NLT_F32_e64 0, 0, 0, [[COPY]], 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $exec_lo, implicit-def $exec_lo - ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY2]], [[V_CMP_NLT_F32_e64_]], implicit-def dead $scc - ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_AND_B32_1]] - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $exec_lo + ; CHECK-NEXT: S_CMP_LG_U32_term [[V_CMP_NLT_F32_e64_]], 0, implicit-def $scc + ; CHECK-NEXT: $exec_lo = S_CMOV_B32_term [[V_CMP_NLT_F32_e64_]], implicit $scc + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; CHECK-NEXT: S_BRANCH %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $exec_lo = 
S_OR_B32_term $exec_lo, [[COPY2]], implicit-def $scc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x80000000) ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $exec_lo = S_OR_B32_term $exec_lo, [[COPY1]], implicit-def $scc + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: - ; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[COPY1]], implicit-def $scc ; CHECK-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1(0x40000000), %bb.4(0x40000000) @@ -177,15 +184,15 @@ body: | bb.2: successors: %bb.3(0x80000000) + SI_WAVE_RECONVERGE killed %1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: successors: %bb.4(0x80000000) - SI_END_CF killed %1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + SI_WAVE_RECONVERGE killed %0, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.4: - SI_END_CF killed %0, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0 ... @@ -203,11 +210,11 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; CHECK-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[COPY]], implicit $exec - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $exec_lo, implicit-def $exec_lo - ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[V_CMP_NE_U32_e64_]], implicit-def dead $scc - ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_AND_B32_]] - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.5, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $exec_lo + ; CHECK-NEXT: S_CMP_LG_U32_term [[V_CMP_NE_U32_e64_]], 0, implicit-def $scc + ; CHECK-NEXT: $exec_lo = S_CMOV_B32_term [[V_CMP_NE_U32_e64_]], implicit $scc + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc + ; CHECK-NEXT: S_BRANCH %bb.5 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.6(0x80000000) @@ -219,11 +226,17 @@ body: | ; CHECK-NEXT: S_BRANCH %bb.6 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; 
CHECK-NEXT: successors: %bb.5(0x80000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE %9, %subreg.sub0, %9, %subreg.sub1, %9, %subreg.sub2, %9, %subreg.sub3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY %11 ; CHECK-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY6]], [[REG_SEQUENCE]], 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; CHECK-NEXT: $exec_lo = S_OR_B32_term $exec_lo, %12, implicit-def $scc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.5(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $exec_lo = S_OR_B32_term $exec_lo, [[COPY1]], implicit-def $scc ; CHECK-NEXT: S_BRANCH %bb.5 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: @@ -232,7 +245,6 @@ body: | ; CHECK-NEXT: bb.5: ; CHECK-NEXT: successors: %bb.4(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[COPY1]], implicit-def $scc ; CHECK-NEXT: S_BRANCH %bb.4 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.6: @@ -252,17 +264,16 @@ body: | ; CHECK-NEXT: S_BRANCH %bb.7 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.7: - ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000) + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; CHECK-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY2]], 0, implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 0, [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $exec_lo, implicit-def $exec_lo - ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY9]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - ; CHECK-NEXT: dead [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[S_AND_B32_1]], [[COPY9]], implicit-def dead $scc - ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_AND_B32_1]] - ; CHECK-NEXT: S_CBRANCH_EXECZ 
%bb.5, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_EQ_U32_e64_]], $exec_lo, implicit-def $scc + ; CHECK-NEXT: S_CMP_LG_U32_term [[V_CMP_EQ_U32_e64_]], 0, implicit-def $scc + ; CHECK-NEXT: $exec_lo = S_CMOV_B32_term [[V_CMP_EQ_U32_e64_]], implicit $scc + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; CHECK-NEXT: S_BRANCH %bb.3 bb.0: successors: %bb.1(0x40000000), %bb.5(0x40000000) liveins: $vgpr0 @@ -288,11 +299,12 @@ body: | %23:sgpr_128 = REG_SEQUENCE killed %19, %subreg.sub0, %19, %subreg.sub1, %19, %subreg.sub2, %19, %subreg.sub3 %24:vgpr_32 = COPY killed %4 BUFFER_ATOMIC_ADD_OFFSET killed %24, killed %23, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + SI_WAVE_RECONVERGE killed %7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: successors: %bb.5(0x80000000) - SI_END_CF killed %7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + SI_WAVE_RECONVERGE killed %0, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.5 bb.4: @@ -301,7 +313,6 @@ body: | bb.5: successors: %bb.4(0x80000000) - SI_END_CF killed %0, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.4 bb.6: diff --git a/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-variables-update.mir b/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-variables-update.mir index 02e3d7e81fd405..f347b77015a5e2 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-variables-update.mir +++ b/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-variables-update.mir @@ -5,7 +5,7 @@ # name used for a copy, so some of the check variable names were # manually fixed. 
-# Check for LiveVariables verifier error after lowering SI_END_CF +# Check for LiveVariables verifier error after lowering SI_WAVE_RECONVERGE --- name: live_variables_update_block_split @@ -21,42 +21,39 @@ body: | ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 0, killed [[COPY]], implicit $exec ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B32_e32_]] ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed [[V_MOV_B32_e32_]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY3]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - ; CHECK-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], [[COPY3]], implicit-def dead $scc - ; CHECK-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_]] - ; CHECK-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[S_XOR_B64_]], implicit $exec - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY $exec + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_]], -1, implicit-def $scc + ; CHECK-NEXT: $exec = S_CMOV_B64_term [[V_CMP_EQ_U32_e64_]], implicit $scc + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; CHECK-NEXT: S_BRANCH %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: successors: %bb.3(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_64_xexec = COPY killed [[S_MOV_B64_term]] - ; CHECK-NEXT: $exec = S_OR_B64_term $exec, killed [[COPY4]], implicit-def $scc - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.2(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[COPY1]] - ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = nsw V_ADD_U32_e32 1, killed [[COPY5]], implicit $exec + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY killed [[COPY1]] + ; CHECK-NEXT: 
[[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = nsw V_ADD_U32_e32 1, killed [[COPY4]], implicit $exec ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed [[V_ADD_U32_e32_]] ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]] + ; CHECK-NEXT: GLOBAL_STORE_DWORD undef %8:vreg_64, [[COPY5]], 0, 0, implicit $exec :: (volatile store (s32), addrspace 1) + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_]], $exec, implicit-def $scc + ; CHECK-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_1]], $exec, implicit-def $scc + ; CHECK-NEXT: [[S_AND_B64_2:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_AND_B64_1]], -1, implicit-def $scc + ; CHECK-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_1]], implicit $scc + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; CHECK-NEXT: S_BRANCH %bb.3 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]] - ; CHECK-NEXT: GLOBAL_STORE_DWORD undef %10:vreg_64, [[COPY6]], 0, 0, implicit $exec :: (volatile store (s32), addrspace 1) - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]] - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed [[COPY7]] - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY8]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - ; CHECK-NEXT: [[S_XOR_B64_1:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_1]], [[COPY8]], implicit-def dead $scc - ; CHECK-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] - ; CHECK-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed 
[[S_XOR_B64_1]], implicit $exec - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]] + ; CHECK-NEXT: $exec = S_OR_B64_term $exec, [[COPY3]], implicit-def $scc + ; CHECK-NEXT: S_BRANCH %bb.1 bb.0: successors: %bb.2(0x40000000), %bb.1(0x40000000) liveins: $vgpr0 @@ -70,19 +67,20 @@ body: | bb.1: successors: %bb.2(0x80000000) - %4:sreg_64_xexec = PHI %5, %bb.2, %3, %bb.0 - %6:vgpr_32 = PHI %7, %bb.2, %1, %bb.0 - SI_END_CF killed %4, implicit-def $exec, implicit-def dead $scc, implicit $exec + %6:vgpr_32 = PHI %7, %bb.21, %1, %bb.0 %8:vgpr_32 = nsw V_ADD_U32_e32 1, killed %6, implicit $exec bb.2: - successors: %bb.2(0x40000000), %bb.1(0x40000000) + successors: %bb.2(0x40000000), %bb.21(0x40000000) %9:vgpr_32 = PHI %8, %bb.1, %7, %bb.2, %1, %bb.0 GLOBAL_STORE_DWORD undef %10:vreg_64, %9, 0, 0, implicit $exec :: (volatile store (s32), addrspace 1) %7:vgpr_32 = COPY killed %9 - %5:sreg_64_xexec = SI_IF %2, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec + %5:sreg_64_xexec = SI_IF %2, %bb.21, implicit-def $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.2 + bb.21: + SI_WAVE_RECONVERGE %3, implicit-def $exec, implicit-def dead $scc, implicit $exec + S_BRANCH %bb.1 ... 
@@ -102,48 +100,44 @@ body: | ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 0, killed [[COPY]], implicit $exec ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B32_e32_]] ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed [[V_MOV_B32_e32_]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY3]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - ; CHECK-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], [[COPY3]], implicit-def dead $scc - ; CHECK-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_]] - ; CHECK-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[S_XOR_B64_]], implicit $exec - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.3 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY $exec + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_]], -1, implicit-def $scc + ; CHECK-NEXT: $exec = S_CMOV_B64_term [[V_CMP_EQ_U32_e64_]], implicit $scc + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.3, implicit $scc + ; CHECK-NEXT: S_BRANCH %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY killed [[COPY1]] - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_64_xexec = COPY killed [[S_MOV_B64_term]] ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.4(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_64_xexec = COPY killed [[COPY5]] - ; CHECK-NEXT: $exec = S_OR_B64_term $exec, killed [[COPY6]], implicit-def $scc - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = nsw V_ADD_U32_e32 1, killed [[COPY4]], implicit $exec ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed 
[[V_ADD_U32_e32_]] ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]] + ; CHECK-NEXT: GLOBAL_STORE_DWORD undef %8:vreg_64, [[COPY5]], 0, 0, implicit $exec :: (volatile store (s32), addrspace 1) + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_]], $exec, implicit-def $scc + ; CHECK-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_1]], $exec, implicit-def $scc + ; CHECK-NEXT: [[S_AND_B64_2:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_AND_B64_1]], -1, implicit-def $scc + ; CHECK-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_1]], implicit $scc + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.3, implicit $scc + ; CHECK-NEXT: S_BRANCH %bb.4 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]] - ; CHECK-NEXT: GLOBAL_STORE_DWORD undef %11:vreg_64, [[COPY7]], 0, 0, implicit $exec :: (volatile store (s32), addrspace 1) - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY7]] - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed [[COPY8]] - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY9]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - ; CHECK-NEXT: [[S_XOR_B64_1:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_1]], [[COPY9]], implicit-def dead $scc - ; CHECK-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] - ; CHECK-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[S_XOR_B64_1]], implicit $exec - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.3 + ; CHECK-NEXT: bb.4: + ; 
CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]] + ; CHECK-NEXT: $exec = S_OR_B64_term $exec, [[COPY3]], implicit-def $scc + ; CHECK-NEXT: S_BRANCH %bb.1 bb.0: successors: %bb.3(0x40000000), %bb.1(0x40000000) liveins: $vgpr0 @@ -157,25 +151,25 @@ body: | bb.1: successors: %bb.2(0x80000000) - %4:sreg_64_xexec = PHI %5, %bb.3, %3, %bb.0 - %6:vgpr_32 = PHI %7, %bb.3, %1, %bb.0 + %6:vgpr_32 = PHI %7, %bb.31, %1, %bb.0 S_BRANCH %bb.2 bb.2: successors: %bb.3(0x80000000) - %8:sreg_64_xexec = COPY %4 - SI_END_CF killed %8, implicit-def $exec, implicit-def dead $scc, implicit $exec %9:vgpr_32 = nsw V_ADD_U32_e32 1, killed %6, implicit $exec bb.3: - successors: %bb.3(0x40000000), %bb.1(0x40000000) + successors: %bb.3(0x40000000), %bb.31(0x40000000) %10:vgpr_32 = PHI %9, %bb.2, %7, %bb.3, %1, %bb.0 GLOBAL_STORE_DWORD undef %11:vreg_64, %10, 0, 0, implicit $exec :: (volatile store (s32), addrspace 1) %7:vgpr_32 = COPY killed %10 - %5:sreg_64_xexec = SI_IF %2, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec + %5:sreg_64_xexec = SI_IF %2, %bb.31, implicit-def $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.3 + bb.31: + SI_WAVE_RECONVERGE %3, implicit-def $exec, implicit-def dead $scc, implicit $exec + S_BRANCH %bb.1 ... 
@@ -195,44 +189,41 @@ body: | ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 0, killed [[COPY]], implicit $exec ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B32_e32_]] ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed [[V_MOV_B32_e32_]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY3]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - ; CHECK-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], [[COPY3]], implicit-def dead $scc - ; CHECK-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_]] - ; CHECK-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[S_XOR_B64_]], implicit $exec - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY $exec + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_]], -1, implicit-def $scc + ; CHECK-NEXT: $exec = S_CMOV_B64_term [[V_CMP_EQ_U32_e64_]], implicit $scc + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; CHECK-NEXT: S_BRANCH %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: successors: %bb.3(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_64_xexec = COPY killed [[S_MOV_B64_term]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1 - ; CHECK-NEXT: $exec = S_OR_B64_term $exec, killed [[COPY4]], implicit-def $scc - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.2(0x80000000) ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1 ; CHECK-NEXT: S_NOP 0, implicit killed [[S_MOV_B64_]] - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[COPY1]] - ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = nsw V_ADD_U32_e32 1, killed [[COPY5]], implicit $exec + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY killed [[COPY1]] + ; CHECK-NEXT: 
[[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = nsw V_ADD_U32_e32 1, killed [[COPY4]], implicit $exec ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed [[V_ADD_U32_e32_]] ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]] + ; CHECK-NEXT: GLOBAL_STORE_DWORD undef %9:vreg_64, [[COPY5]], 0, 0, implicit $exec :: (volatile store (s32), addrspace 1) + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_]], $exec, implicit-def $scc + ; CHECK-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_1]], $exec, implicit-def $scc + ; CHECK-NEXT: [[S_AND_B64_2:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_AND_B64_1]], -1, implicit-def $scc + ; CHECK-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_1]], implicit $scc + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; CHECK-NEXT: S_BRANCH %bb.3 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]] - ; CHECK-NEXT: GLOBAL_STORE_DWORD undef %11:vreg_64, [[COPY6]], 0, 0, implicit $exec :: (volatile store (s32), addrspace 1) - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]] - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed [[COPY7]] - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY8]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - ; CHECK-NEXT: [[S_XOR_B64_1:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_1]], [[COPY8]], implicit-def dead $scc - ; CHECK-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] - ; CHECK-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed 
[[S_XOR_B64_1]], implicit $exec - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]] + ; CHECK-NEXT: $exec = S_OR_B64_term $exec, [[COPY3]], implicit-def $scc + ; CHECK-NEXT: S_BRANCH %bb.1 bb.0: liveins: $vgpr0 @@ -243,20 +234,21 @@ body: | S_BRANCH %bb.2 bb.1: - %4:sreg_64_xexec = PHI %5, %bb.2, %3, %bb.0 - %6:vgpr_32 = PHI %7, %bb.2, %1, %bb.0 + %6:vgpr_32 = PHI %7, %bb.21, %1, %bb.0 %8:sreg_64 = S_MOV_B64 1 - SI_END_CF killed %4, implicit-def $exec, implicit-def dead $scc, implicit $exec S_NOP 0, implicit killed %8 %9:vgpr_32 = nsw V_ADD_U32_e32 1, killed %6, implicit $exec bb.2: - successors: %bb.2(0x40000000), %bb.1(0x40000000) + successors: %bb.2(0x40000000), %bb.21(0x40000000) %10:vgpr_32 = PHI %9, %bb.1, %7, %bb.2, %1, %bb.0 GLOBAL_STORE_DWORD undef %11:vreg_64, %10, 0, 0, implicit $exec :: (volatile store (s32), addrspace 1) %7:vgpr_32 = COPY killed %10 - %5:sreg_64_xexec = SI_IF %2, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec + %5:sreg_64_xexec = SI_IF %2, %bb.21, implicit-def $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.2 + bb.21: + SI_WAVE_RECONVERGE %3, implicit-def $exec, implicit-def dead $scc, implicit $exec + S_BRANCH %bb.1 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-variables-update.xfail.mir b/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-variables-update.xfail.mir index f4e26aeae67666..2a5bb2b47d96e6 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-variables-update.xfail.mir +++ b/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-variables-update.xfail.mir @@ -1,3 +1,4 @@ +# XFAIL: * # RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -start-before=livevars -stop-after=twoaddressinstruction -verify-machineinstrs -o - %s 2>&1 | FileCheck %s # CHECK: *** Bad machine code: LiveVariables: Block missing from AliveBlocks *** @@ -26,8 +27,8 @@ body: | %4:sreg_64_xexec = PHI %5, %bb.3, %3, %bb.0 %6:vgpr_32 = PHI %7, %bb.3, %1, %bb.0 %8:sreg_64 = S_MOV_B64 1 - SI_END_CF killed %4, implicit-def $exec, implicit-def dead $scc, implicit $exec %9:vgpr_32 = nsw V_ADD_U32_e32 1, killed %6, implicit $exec + SI_WAVE_RECONVERGE killed %4, implicit-def $exec, implicit-def dead $scc, implicit $exec bb.2: S_NOP 0, implicit killed %8 diff --git a/llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir b/llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir index 914cc8ae8844cb..33870cdc7c8cbd 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir +++ b/llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir @@ -21,13 +21,12 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 0, [[COPY]], implicit $exec - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - ; CHECK-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], [[COPY2]], implicit-def dead $scc - ; CHECK-NEXT: $exec = S_MOV_B64_term killed 
[[S_AND_B64_]] + ; CHECK-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_EQ_U32_e64_]], $exec, implicit-def $scc + ; CHECK-NEXT: S_CMP_LG_U64_term [[V_CMP_EQ_U32_e64_]], 0, implicit-def $scc + ; CHECK-NEXT: $exec = S_CMOV_B64_term [[V_CMP_EQ_U32_e64_]], implicit $scc ; CHECK-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[COPY1]], implicit $exec - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; CHECK-NEXT: S_BRANCH %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -68,12 +67,12 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 0, [[COPY]], implicit $exec - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - ; CHECK-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], [[COPY2]], implicit-def dead $scc - ; CHECK-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_]] + ; CHECK-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_EQ_U32_e64_]], $exec, implicit-def $scc + ; CHECK-NEXT: S_CMP_LG_U64_term [[V_CMP_EQ_U32_e64_]], 0, implicit-def $scc + ; CHECK-NEXT: $exec = S_CMOV_B64_term [[V_CMP_EQ_U32_e64_]], implicit $scc ; CHECK-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[COPY1]], implicit $exec - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc + ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -109,15 +108,15 @@ body: | ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, 
$sgpr4_sgpr5 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[S_OR_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_OR_SAVEEXEC_B64 %2, implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 0, [[COPY]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_B64_]], implicit-def $scc + ; CHECK-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_EQ_U32_e64_]], $exec, implicit-def $scc + ; CHECK-NEXT: S_CMP_LG_U64_term [[V_CMP_EQ_U32_e64_]], 0, implicit-def $scc + ; CHECK-NEXT: $exec = S_CMOV_B64_term [[V_CMP_EQ_U32_e64_]], implicit $scc ; CHECK-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[COPY1]], implicit $exec - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; CHECK-NEXT: S_BRANCH %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -157,9 +156,10 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 0, [[COPY]], implicit $exec - ; CHECK-NEXT: $exec = S_ANDN2_B64_term $exec, [[V_CMP_EQ_U32_e64_]], implicit-def $scc + ; CHECK-NEXT: [[S_ANDN2_B64_:%[0-9]+]]:sreg_64 = S_ANDN2_B64 $exec, [[V_CMP_EQ_U32_e64_]], implicit-def $scc + ; CHECK-NEXT: $exec = S_CSELECT_B64_term [[S_ANDN2_B64_]], [[V_CMP_EQ_U32_e64_]], implicit $scc ; CHECK-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[COPY1]], implicit $exec - ; CHECK-NEXT: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc ; 
CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: @@ -209,40 +209,34 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed $vgpr1 ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 killed [[COPY]], killed [[COPY1]], implicit $exec - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - ; CHECK-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], [[COPY2]], implicit-def dead $scc - ; CHECK-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_]] + ; CHECK-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_EQ_U32_e64_]], $exec, implicit-def $scc + ; CHECK-NEXT: S_CMP_LG_U64_term [[V_CMP_EQ_U32_e64_]], 0, implicit-def $scc + ; CHECK-NEXT: $exec = S_CMOV_B64_term [[V_CMP_EQ_U32_e64_]], implicit $scc ; CHECK-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_]], implicit $exec ; CHECK-NEXT: [[S_MOV_B64_term1:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_]], implicit $exec - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; CHECK-NEXT: S_BRANCH %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term1]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term1]] ; CHECK-NEXT: dead [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD undef %8:vreg_64, 0, 0, implicit $exec :: (volatile load (s32), addrspace 1) - ; CHECK-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = COPY [[COPY3]] + ; CHECK-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = COPY [[COPY2]] ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: 
%bb.3(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term]] - ; CHECK-NEXT: $exec = S_OR_B64_term $exec, killed [[COPY4]], implicit-def $scc - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term]] ; CHECK-NEXT: S_SLEEP 1 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY5]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - ; CHECK-NEXT: [[S_XOR_B64_1:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_1]], [[COPY5]], implicit-def dead $scc - ; CHECK-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_]], $exec, implicit-def $scc + ; CHECK-NEXT: [[S_XOR_B64_1:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], $exec, implicit-def $scc + ; CHECK-NEXT: S_CMP_LG_U64_term [[S_AND_B64_]], 0, implicit-def $scc + ; CHECK-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_]], implicit $scc ; CHECK-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_1]], implicit $exec ; CHECK-NEXT: [[S_MOV_B64_term1:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_1]], implicit $exec - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; CHECK-NEXT: S_BRANCH %bb.1 bb.0: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31 @@ -261,7 +255,6 @@ body: | bb.2: %12:sreg_64_xexec = COPY %14 - SI_END_CF killed %12, implicit-def $exec, implicit-def dead $scc, implicit $exec S_SLEEP 1 %9:sreg_64_xexec = SI_IF %3, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec %14:sreg_64_xexec = S_MOV_B64_term %9, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/lower-i1-copies-clear-kills.mir 
b/llvm/test/CodeGen/AMDGPU/lower-i1-copies-clear-kills.mir index c5e2ba5d8c7cba..faea7bebdc8fce 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-i1-copies-clear-kills.mir +++ b/llvm/test/CodeGen/AMDGPU/lower-i1-copies-clear-kills.mir @@ -42,13 +42,13 @@ body: | ; CHECK-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[PHI1]], [[PHI2]], implicit $exec ; CHECK-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], $exec_lo, implicit-def $scc + ; CHECK-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.5(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[S_AND_B32_]], %bb.1, [[S_OR_B32_]], %bb.2 ; CHECK-NEXT: [[PHI4:%[0-9]+]]:vgpr_32 = PHI [[PHI2]], %bb.1, [[V_OR_B32_e64_]], %bb.2 - ; CHECK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[PHI3]] @@ -63,6 +63,7 @@ body: | ; CHECK-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[PHI1]], killed [[S_MOV_B32_5]], implicit-def dead $scc ; CHECK-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; CHECK-NEXT: [[S_XOR_B32_1:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, -1, implicit-def $scc + ; CHECK-NEXT: SI_WAVE_RECONVERGE [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: successors: %bb.6(0x04000000), %bb.1(0x7c000000) @@ -70,15 +71,12 @@ body: | ; CHECK-NEXT: [[PHI5:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_4]], %bb.3, [[S_XOR_B32_1]], %bb.4 ; CHECK-NEXT: [[PHI6:%[0-9]+]]:vgpr_32 = PHI [[COPY8]], %bb.3, [[PHI4]], %bb.4 ; CHECK-NEXT: [[PHI7:%[0-9]+]]:sreg_32 = PHI [[DEF]], %bb.3, [[S_OR_B32_1]], %bb.4 - ; CHECK-NEXT: 
SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[PHI5]] ; CHECK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[COPY9]], [[PHI]], implicit-def dead $scc ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.6 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.6: - ; CHECK-NEXT: [[PHI8:%[0-9]+]]:sreg_32 = PHI [[SI_IF_BREAK]], %bb.5 - ; CHECK-NEXT: SI_END_CF [[PHI8]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1(0x80000000) @@ -114,13 +112,13 @@ body: | %21:vgpr_32 = V_OR_B32_e64 %15, %17, implicit $exec %22:sreg_32 = S_MOV_B32 -1 %23:vreg_1 = COPY %22, implicit $exec + SI_WAVE_RECONVERGE %20, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: successors: %bb.4(0x40000000), %bb.5(0x40000000) %24:vgpr_32 = PHI %17, %bb.1, %21, %bb.2 %25:vreg_1 = PHI %7, %bb.1, %23, %bb.2 - SI_END_CF %20, implicit-def dead $exec, implicit-def dead $scc, implicit $exec %26:sreg_32 = S_MOV_B32 -1 %27:sreg_32 = IMPLICIT_DEF %28:sreg_32 = COPY %25 @@ -136,6 +134,7 @@ body: | %33:sreg_32 = S_OR_B32 %15, killed %32, implicit-def dead $scc %34:sreg_32 = S_MOV_B32 0 %35:vreg_1 = COPY %34, implicit $exec + SI_WAVE_RECONVERGE %31, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.5: successors: %bb.6(0x04000000), %bb.1(0x7c000000) @@ -143,15 +142,12 @@ body: | %18:vgpr_32 = PHI %29, %bb.3, %24, %bb.4 %16:sreg_32 = PHI %27, %bb.3, %33, %bb.4 %36:vreg_1 = PHI %30, %bb.3, %35, %bb.4 - SI_END_CF %31, implicit-def dead $exec, implicit-def dead $scc, implicit $exec %37:sreg_32 = COPY %36 %14:sreg_32 = SI_IF_BREAK %37, %13, implicit-def dead $scc SI_LOOP %14, %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.6 bb.6: - %38:sreg_32 = PHI %14, %bb.5 - SI_END_CF %38, implicit-def dead $exec, implicit-def dead 
$scc, implicit $exec S_ENDPGM 0 ... diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir index efa21052e3ae2f..27a7c82ab0bde1 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir @@ -38,13 +38,13 @@ body: | ; GFX9-NEXT: [[V_FMAC_F32_e64_3:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_2]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX9-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_]], [[V_FMAC_F32_e64_1]], implicit $mode, implicit $exec ; GFX9-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_2]], [[V_FMAC_F32_e64_3]], implicit $mode, implicit $exec + ; GFX9-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: bb.2: ; GFX9-NEXT: successors: %bb.3(0x80000000) ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, [[V_ADD_F32_e32_]], %bb.1 ; GFX9-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_1]], %bb.0, [[V_ADD_F32_e32_1]], %bb.1 - ; GFX9-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: bb.3: ; GFX9-NEXT: S_ENDPGM 0, implicit [[PHI]], implicit [[PHI1]] @@ -74,11 +74,11 @@ body: | bb.1: %20:vgpr_32 = V_ADD_F32_e32 %10, %11, implicit $mode, implicit $exec %21:vgpr_32 = V_ADD_F32_e32 %13, %14, implicit $mode, implicit $exec + SI_WAVE_RECONVERGE %19, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2: %22:vgpr_32 = PHI %3, %bb.0, %20, %bb.1 %23:vgpr_32 = PHI %4, %bb.0, %21, %bb.1 - SI_END_CF %19, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: S_ENDPGM 0, implicit %22, implicit %23 @@ -120,13 +120,13 @@ body: | ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: 
[[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_]], [[V_FMAC_F32_e64_1]], implicit $mode, implicit $exec ; GFX9-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_2]], [[V_FMAC_F32_e64_3]], implicit $mode, implicit $exec + ; GFX9-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: bb.2: ; GFX9-NEXT: successors: %bb.3(0x80000000) ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, [[V_ADD_F32_e32_]], %bb.1 ; GFX9-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_1]], %bb.0, [[V_ADD_F32_e32_1]], %bb.1 - ; GFX9-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: bb.3: ; GFX9-NEXT: [[V_ADD_F32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_3]], [[V_FMAC_F32_e64_1]], implicit $mode, implicit $exec @@ -157,11 +157,11 @@ body: | bb.1: %20:vgpr_32 = V_ADD_F32_e32 %10, %11, implicit $mode, implicit $exec %21:vgpr_32 = V_ADD_F32_e32 %13, %14, implicit $mode, implicit $exec + SI_WAVE_RECONVERGE %19, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2: %22:vgpr_32 = PHI %3, %bb.0, %20, %bb.1 %23:vgpr_32 = PHI %4, %bb.0, %21, %bb.1 - SI_END_CF %19, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: %24:vgpr_32 = V_ADD_F32_e32 %14, %11, implicit $mode, implicit $exec @@ -205,13 +205,13 @@ body: | ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_]], [[V_FMAC_F32_e64_1]], implicit $mode, implicit $exec ; GFX9-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_2]], [[V_FMAC_F32_e64_3]], implicit $mode, implicit $exec + ; GFX9-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: bb.2: ; GFX9-NEXT: successors: %bb.3(0x80000000) ; GFX9-NEXT: {{ $}} ; 
GFX9-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, [[V_ADD_F32_e32_]], %bb.1 ; GFX9-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_1]], %bb.0, [[V_ADD_F32_e32_1]], %bb.1 - ; GFX9-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: bb.3: ; GFX9-NEXT: S_ENDPGM 0, implicit [[PHI]], implicit [[PHI1]] @@ -242,11 +242,11 @@ body: | bb.1: %20:vgpr_32 = V_ADD_F32_e32 %10, %11, implicit $mode, implicit $exec %21:vgpr_32 = V_ADD_F32_e32 %13, %14, implicit $mode, implicit $exec + SI_WAVE_RECONVERGE %19, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2: %22:vgpr_32 = PHI %3, %bb.0, %20, %bb.1 %23:vgpr_32 = PHI %4, %bb.0, %21, %bb.1 - SI_END_CF %19, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: S_ENDPGM 0, implicit %22, implicit %23 @@ -279,11 +279,12 @@ body: | ; GFX9-NEXT: bb.1: ; GFX9-NEXT: successors: %bb.2(0x80000000) ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: bb.2: ; GFX9-NEXT: successors: %bb.3(0x80000000) ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: S_NOP 0, implicit [[V_FMAC_F32_e64_]] - ; GFX9-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: bb.3: ; GFX9-NEXT: S_ENDPGM 0, implicit %6 @@ -306,10 +307,10 @@ body: | S_BRANCH %bb.1 bb.1: + SI_WAVE_RECONVERGE %5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2: S_NOP 0, implicit %6 - SI_END_CF %5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: S_ENDPGM 0, implicit %9 @@ -348,6 +349,7 @@ body: | ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: S_NOP 0 + ; GFX9-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: bb.3: ; GFX9-NEXT: successors: 
%bb.4(0x40000000), %bb.6(0x40000000) @@ -356,7 +358,6 @@ body: | ; GFX9-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec ; GFX9-NEXT: [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec ; GFX9-NEXT: S_NOP 0, implicit [[V_FMAC_F32_e64_]], implicit [[V_FMAC_F32_e64_1]] - ; GFX9-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX9-NEXT: S_CBRANCH_EXECZ %bb.6, implicit $exec ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: bb.4: @@ -412,13 +413,13 @@ body: | liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc S_NOP 0 + SI_WAVE_RECONVERGE %5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: successors: %bb.4(0x40000000), %bb.6(0x40000000) liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc S_NOP 0, implicit %6, implicit %7 - SI_END_CF %5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_CBRANCH_EXECZ %bb.6, implicit $exec bb.4: @@ -481,12 +482,12 @@ body: | ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: S_NOP 0 + ; GFX9-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: bb.3: ; GFX9-NEXT: successors: %bb.4(0x40000000), %bb.6(0x40000000) ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX9-NEXT: S_CBRANCH_EXECZ %bb.6, implicit $exec ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: bb.4: @@ -543,12 +544,12 @@ body: | liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc S_NOP 0 + SI_WAVE_RECONVERGE %5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: successors: %bb.4(0x40000000), %bb.6(0x40000000) liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc - 
SI_END_CF %5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_CBRANCH_EXECZ %bb.6, implicit $exec bb.4: @@ -620,13 +621,13 @@ body: | ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: S_NOP 0 + ; GFX9-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX9-NEXT: S_BRANCH %bb.4 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: bb.4: ; GFX9-NEXT: successors: %bb.5(0x40000000), %bb.7(0x40000000) ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX9-NEXT: S_CBRANCH_EXECZ %bb.7, implicit $exec ; GFX9-NEXT: S_BRANCH %bb.5 ; GFX9-NEXT: {{ $}} @@ -695,13 +696,13 @@ body: | liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc S_NOP 0 + SI_WAVE_RECONVERGE %5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.4 bb.4: successors: %bb.5(0x40000000), %bb.7(0x40000000) liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc - SI_END_CF %5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_CBRANCH_EXECZ %bb.7, implicit $exec S_BRANCH %bb.5 diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir index 04c80582f6f079..ea10d5b8ffb9d3 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir @@ -40,7 +40,6 @@ body: | ; CHECK-NEXT: S_BRANCH %bb.4 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: SI_END_CF %9, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: S_ENDPGM 0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: @@ -53,6 +52,7 @@ body: | ; CHECK-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32 = S_ANDN2_B32 [[S_OR_B32_1]], $exec_lo, implicit-def $scc ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_NE_U32_e64_]], $exec_lo, implicit-def $scc ; CHECK-NEXT: 
[[S_OR_B32_2:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_ANDN2_B32_1]], [[S_AND_B32_]], implicit-def $scc + ; CHECK-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: successors: %bb.6(0x04000000), %bb.2(0x7c000000) @@ -60,7 +60,6 @@ body: | ; CHECK-NEXT: [[PHI6:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_1]], %bb.2, [[S_OR_B32_2]], %bb.4 ; CHECK-NEXT: [[PHI7:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]], %bb.2, [[COPY4]], %bb.4 ; CHECK-NEXT: [[PHI8:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.2, [[V_ADD_U32_e64_]], %bb.4 - ; CHECK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[PHI6]], [[PHI4]], implicit-def dead $scc ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.6 @@ -69,7 +68,6 @@ body: | ; CHECK-NEXT: successors: %bb.3(0x04000000), %bb.1(0x7c000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[PHI9:%[0-9]+]]:vgpr_32 = PHI [[PHI8]], %bb.5 - ; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: [[SI_IF_BREAK1:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[PHI7]], [[PHI]], implicit-def dead $scc ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK1]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.3 @@ -107,7 +105,6 @@ body: | S_BRANCH %bb.4 bb.3: - SI_END_CF %12, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0 bb.4: @@ -120,6 +117,7 @@ body: | %49:sreg_32 = S_ANDN2_B32 %45, $exec_lo, implicit-def $scc %50:sreg_32 = S_AND_B32 %30, $exec_lo, implicit-def $scc %46:sreg_32 = S_OR_B32 %49, %50, implicit-def $scc + SI_WAVE_RECONVERGE %4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.5: successors: %bb.6(0x04000000), %bb.2(0x7c000000) @@ -127,7 +125,6 @@ body: | %10:sreg_32 = PHI %45, 
%bb.2, %46, %bb.4 %8:sreg_32 = PHI %39, %bb.2, %40, %bb.4 %9:vgpr_32 = PHI %36, %bb.2, %6, %bb.4 - SI_END_CF %4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec %11:sreg_32 = SI_IF_BREAK %10, %2, implicit-def dead $scc %12:sreg_32 = SI_IF_BREAK %8, %0, implicit-def dead $scc SI_LOOP %11, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec @@ -137,7 +134,6 @@ body: | successors: %bb.3(0x04000000), %bb.1(0x7c000000) %13:vgpr_32 = PHI %9, %bb.5 - SI_END_CF %11, implicit-def dead $exec, implicit-def dead $scc, implicit $exec SI_LOOP %12, %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.3 ... diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll index b8e74bc7db09a1..13d07b3d6204bb 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll @@ -10,35 +10,37 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_and_b32_e32 v1, 1, v1 ; CHECK-NEXT: v_and_b32_e32 v3, 1, v3 -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b32 s6, 0 ; CHECK-NEXT: v_cmp_eq_u32_e64 s4, 1, v1 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; CHECK-NEXT: s_xor_b32 s6, s4, -1 +; CHECK-NEXT: s_xor_b32 s5, s4, -1 ; CHECK-NEXT: s_inst_prefetch 0x1 ; CHECK-NEXT: s_branch .LBB0_3 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB0_1: ; %Flow ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; CHECK-NEXT: v_add_nc_u32_e32 v4, -4, v4 -; CHECK-NEXT: .LBB0_2: ; %Flow1 -; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; CHECK-NEXT: .LBB0_2: ; %for.end121 +; CHECK-NEXT: ; in Loop: 
Header=BB0_3 Depth=1 ; CHECK-NEXT: v_cmp_ne_u32_e64 s4, 0, v3 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; j lastloop entry ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_or_b32 s5, s4, s5 -; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; CHECK-NEXT: s_cbranch_execz .LBB0_8 +; CHECK-NEXT: s_or_b32 s6, s4, s6 +; CHECK-NEXT: s_andn2_b32 s4, exec_lo, s6 +; CHECK-NEXT: s_cselect_b32 exec_lo, s4, s6 +; CHECK-NEXT: s_cbranch_scc0 .LBB0_8 ; CHECK-NEXT: .LBB0_3: ; %for.body33 ; CHECK-NEXT: ; =>This Loop Header: Depth=1 ; CHECK-NEXT: ; Child Loop BB0_6 Depth 2 ; CHECK-NEXT: v_mov_b32_e32 v4, 0 ; CHECK-NEXT: v_mov_b32_e32 v3, 0 -; CHECK-NEXT: s_and_saveexec_b32 s7, s6 -; CHECK-NEXT: s_cbranch_execz .LBB0_2 +; CHECK-NEXT: s_mov_b32 s7, exec_lo +; CHECK-NEXT: s_and_b32 s4, s5, exec_lo +; CHECK-NEXT: s_cmov_b32 exec_lo, s4 +; CHECK-NEXT: s_cbranch_scc0 .LBB0_2 ; CHECK-NEXT: ; %bb.4: ; %for.body51.preheader ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: s_mov_b32 s8, 0 @@ -47,7 +49,6 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB0_5: ; %if.end118 ; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_add_i32 s9, s9, 4 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; backedge @@ -55,24 +56,27 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49 ; CHECK-NEXT: v_add_nc_u32_e32 v4, s9, v2 ; CHECK-NEXT: v_cmp_ge_u32_e64 s4, v4, v0 ; CHECK-NEXT: s_or_b32 s8, s4, s8 -; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 -; CHECK-NEXT: s_cbranch_execz .LBB0_1 +; CHECK-NEXT: s_andn2_b32 s4, exec_lo, s8 +; CHECK-NEXT: s_cselect_b32 exec_lo, s4, s8 +; CHECK-NEXT: s_cbranch_scc0 .LBB0_1 ; CHECK-NEXT: .LBB0_6: ; %for.body51 ; CHECK-NEXT: ; Parent Loop BB0_3 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v3, 1 -; CHECK-NEXT: s_and_saveexec_b32 s4, vcc_lo -; CHECK-NEXT: s_cbranch_execz .LBB0_5 +; 
CHECK-NEXT: s_mov_b32 s4, exec_lo +; CHECK-NEXT: s_and_b32 s10, vcc_lo, exec_lo +; CHECK-NEXT: s_cmov_b32 exec_lo, s10 +; CHECK-NEXT: s_cbranch_scc0 .LBB0_5 ; CHECK-NEXT: ; %bb.7: ; %if.then112 ; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2 ; CHECK-NEXT: s_add_i32 s10, s9, 4 ; CHECK-NEXT: v_mov_b32_e32 v3, 0 ; CHECK-NEXT: v_mov_b32_e32 v4, s10 ; CHECK-NEXT: ds_write_b32 v1, v4 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_branch .LBB0_5 ; CHECK-NEXT: .LBB0_8: ; %for.body159.preheader ; CHECK-NEXT: s_inst_prefetch 0x2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; CHECK-NEXT: s_mov_b32 vcc_lo, exec_lo ; CHECK-NEXT: .LBB0_9: ; %for.body159 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir index 037a285794120d..c443299e995b65 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir @@ -50,7 +50,6 @@ body: | ; CHECK-NEXT: successors: %bb.2(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.4 - ; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]] ; CHECK-NEXT: S_BRANCH %bb.2 @@ -96,7 +95,6 @@ body: | bb.5: %7:vgpr_32 = PHI %0, %bb.4 - SI_END_CF %6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec INLINEASM &"", 1, implicit %5 S_BRANCH %bb.2 @@ -161,7 +159,6 @@ body: | ; CHECK-NEXT: successors: %bb.2(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.4 - ; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK]], implicit-def dead 
$exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY]], [[COPY1]], implicit-def dead $scc ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[S_ADD_I32_]] ; CHECK-NEXT: S_BRANCH %bb.2 @@ -207,7 +204,6 @@ body: | bb.5: %7:vgpr_32 = PHI %0, %bb.4 - SI_END_CF %6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec INLINEASM &"", 1, implicit %5 S_BRANCH %bb.2 diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll index 8861ee380be031..9804450b0d5f40 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll @@ -112,8 +112,10 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CHECK-NEXT: v_mov_b32_e32 v42, v0 ; CHECK-NEXT: s_mov_b32 s42, exec_lo -; CHECK-NEXT: v_cmpx_ne_u32_e32 0, v42 -; CHECK-NEXT: s_cbranch_execz .LBB0_25 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v42 +; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; CHECK-NEXT: s_cmov_b32 exec_lo, vcc_lo +; CHECK-NEXT: s_cbranch_scc0 .LBB0_26 ; CHECK-NEXT: ; %bb.1: ; %.preheader5 ; CHECK-NEXT: v_mul_lo_u32 v0, v41, 14 ; CHECK-NEXT: s_mov_b32 s4, 0 @@ -125,61 +127,84 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v42 ; CHECK-NEXT: ds_write_b8 v1, v45 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: s_cbranch_execnz .LBB0_2 +; CHECK-NEXT: s_andn2_b32 s6, exec_lo, s4 +; CHECK-NEXT: s_cselect_b32 exec_lo, s6, s4 +; CHECK-NEXT: s_cbranch_scc1 .LBB0_2 ; CHECK-NEXT: ; %bb.3: -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: v_add_nc_u32_e32 v45, -1, v42 -; CHECK-NEXT: s_mov_b32 s43, 0 +; CHECK-NEXT: s_mov_b32 s43, exec_lo 
+; CHECK-NEXT: s_mov_b32 s48, 0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v45 -; CHECK-NEXT: s_and_b32 exec_lo, exec_lo, vcc_lo -; CHECK-NEXT: s_cbranch_execz .LBB0_25 +; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; CHECK-NEXT: s_cmov_b32 exec_lo, vcc_lo +; CHECK-NEXT: s_cbranch_scc0 .LBB0_25 ; CHECK-NEXT: ; %bb.4: ; CHECK-NEXT: v_lshlrev_b32_e32 v43, 10, v43 ; CHECK-NEXT: v_add_nc_u32_e32 v46, 0x3c05, v0 ; CHECK-NEXT: v_mov_b32_e32 v47, 0 -; CHECK-NEXT: s_mov_b32 s49, 0 -; CHECK-NEXT: .LBB0_5: ; =>This Loop Header: Depth=1 -; CHECK-NEXT: ; Child Loop BB0_8 Depth 2 -; CHECK-NEXT: ; Child Loop BB0_20 Depth 2 -; CHECK-NEXT: v_add_nc_u32_e32 v0, s49, v44 -; CHECK-NEXT: s_lshl_b32 s4, s49, 5 -; CHECK-NEXT: s_add_i32 s48, s49, 1 -; CHECK-NEXT: s_add_i32 s5, s49, 5 -; CHECK-NEXT: v_or3_b32 v57, s4, v43, s48 +; CHECK-NEXT: s_mov_b32 s52, 0 +; CHECK-NEXT: s_branch .LBB0_7 +; CHECK-NEXT: .LBB0_5: ; %Flow43 +; CHECK-NEXT: ; in Loop: Header=BB0_7 Depth=1 +; CHECK-NEXT: s_inst_prefetch 0x2 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s52 +; CHECK-NEXT: .LBB0_6: ; %Flow44 +; CHECK-NEXT: ; in Loop: Header=BB0_7 Depth=1 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s49, v45 +; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v47 +; CHECK-NEXT: v_add_nc_u32_e32 v46, 1, v46 +; CHECK-NEXT: s_mov_b32 s52, s49 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 +; CHECK-NEXT: s_and_b32 s4, exec_lo, s4 +; CHECK-NEXT: s_or_b32 s48, s4, s48 +; CHECK-NEXT: s_andn2_b32 s4, exec_lo, s48 +; CHECK-NEXT: s_cselect_b32 exec_lo, s4, s48 +; CHECK-NEXT: s_cbranch_scc0 .LBB0_24 +; CHECK-NEXT: .LBB0_7: ; =>This Loop Header: Depth=1 +; CHECK-NEXT: ; Child Loop BB0_10 Depth 2 +; CHECK-NEXT: ; Child Loop BB0_22 Depth 2 +; CHECK-NEXT: v_add_nc_u32_e32 v0, s52, v44 +; CHECK-NEXT: s_add_i32 s5, s52, 5 +; CHECK-NEXT: s_lshl_b32 s4, s52, 5 +; CHECK-NEXT: s_add_i32 s49, s52, 1 +; CHECK-NEXT: v_cmp_lt_u32_e32 vcc_lo, s5, v42 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: ds_read_u8 v56, v0 -; CHECK-NEXT: v_mov_b32_e32 v58, s48 -; CHECK-NEXT: 
s_mov_b32 s52, exec_lo -; CHECK-NEXT: v_cmpx_lt_u32_e64 s5, v42 -; CHECK-NEXT: s_cbranch_execz .LBB0_17 -; CHECK-NEXT: ; %bb.6: ; %.preheader2 -; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: s_mov_b32 s53, 0 +; CHECK-NEXT: v_or3_b32 v57, s4, v43, s49 +; CHECK-NEXT: v_mov_b32_e32 v58, s49 +; CHECK-NEXT: s_mov_b32 s53, exec_lo +; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; CHECK-NEXT: s_cmov_b32 exec_lo, vcc_lo +; CHECK-NEXT: s_cbranch_scc0 .LBB0_19 +; CHECK-NEXT: ; %bb.8: ; %.preheader2 +; CHECK-NEXT: ; in Loop: Header=BB0_7 Depth=1 ; CHECK-NEXT: s_mov_b32 s54, 0 -; CHECK-NEXT: s_branch .LBB0_8 -; CHECK-NEXT: .LBB0_7: ; in Loop: Header=BB0_8 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55 -; CHECK-NEXT: s_add_i32 s54, s54, 4 -; CHECK-NEXT: s_add_i32 s4, s49, s54 -; CHECK-NEXT: v_add_nc_u32_e32 v0, s54, v57 +; CHECK-NEXT: s_mov_b32 s55, 0 +; CHECK-NEXT: s_branch .LBB0_10 +; CHECK-NEXT: .LBB0_9: ; in Loop: Header=BB0_10 Depth=2 +; CHECK-NEXT: s_add_i32 s55, s55, 4 +; CHECK-NEXT: s_add_i32 s4, s52, s55 +; CHECK-NEXT: v_add_nc_u32_e32 v0, s55, v57 ; CHECK-NEXT: s_add_i32 s5, s4, 5 ; CHECK-NEXT: s_add_i32 s4, s4, 1 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s5, v42 ; CHECK-NEXT: v_mov_b32_e32 v58, s4 -; CHECK-NEXT: s_or_b32 s53, vcc_lo, s53 -; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s53 -; CHECK-NEXT: s_cbranch_execz .LBB0_16 -; CHECK-NEXT: .LBB0_8: ; Parent Loop BB0_5 Depth=1 +; CHECK-NEXT: s_or_b32 s54, vcc_lo, s54 +; CHECK-NEXT: s_andn2_b32 s4, exec_lo, s54 +; CHECK-NEXT: s_cselect_b32 exec_lo, s4, s54 +; CHECK-NEXT: s_cbranch_scc0 .LBB0_18 +; CHECK-NEXT: .LBB0_10: ; Parent Loop BB0_7 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 -; CHECK-NEXT: v_add_nc_u32_e32 v59, s54, v46 -; CHECK-NEXT: v_add_nc_u32_e32 v58, s54, v57 +; CHECK-NEXT: v_add_nc_u32_e32 v59, s55, v46 +; CHECK-NEXT: v_add_nc_u32_e32 v58, s55, v57 +; CHECK-NEXT: s_mov_b32 s56, exec_lo ; CHECK-NEXT: ds_read_u8 v0, v59 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: 
v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD -; CHECK-NEXT: s_and_saveexec_b32 s55, s4 -; CHECK-NEXT: s_cbranch_execz .LBB0_10 -; CHECK-NEXT: ; %bb.9: ; in Loop: Header=BB0_8 Depth=2 +; CHECK-NEXT: s_cmp_lg_u32 s4, 0 +; CHECK-NEXT: s_cmov_b32 exec_lo, s4 +; CHECK-NEXT: s_cbranch_scc0 .LBB0_12 +; CHECK-NEXT: ; %bb.11: ; in Loop: Header=BB0_10 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00 ; CHECK-NEXT: s_add_u32 s8, s34, 40 @@ -196,14 +221,16 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CHECK-NEXT: ds_write_b32 v0, v58 -; CHECK-NEXT: .LBB0_10: ; in Loop: Header=BB0_8 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s56 +; CHECK-NEXT: .LBB0_12: ; in Loop: Header=BB0_10 Depth=2 ; CHECK-NEXT: ds_read_u8 v0, v59 offset:1 +; CHECK-NEXT: s_mov_b32 s56, exec_lo ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD -; CHECK-NEXT: s_and_saveexec_b32 s55, s4 -; CHECK-NEXT: s_cbranch_execz .LBB0_12 -; CHECK-NEXT: ; %bb.11: ; in Loop: Header=BB0_8 Depth=2 +; CHECK-NEXT: s_cmp_lg_u32 s4, 0 +; CHECK-NEXT: s_cmov_b32 exec_lo, s4 +; CHECK-NEXT: s_cbranch_scc0 .LBB0_14 +; CHECK-NEXT: ; %bb.13: ; in Loop: Header=BB0_10 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00 ; CHECK-NEXT: s_add_u32 s8, s34, 40 @@ -221,14 +248,16 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CHECK-NEXT: ds_write_b32 v0, v60 -; CHECK-NEXT: .LBB0_12: ; in Loop: Header=BB0_8 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s56 +; CHECK-NEXT: .LBB0_14: ; in Loop: Header=BB0_10 Depth=2 ; CHECK-NEXT: ds_read_u8 v0, v59 offset:2 +; 
CHECK-NEXT: s_mov_b32 s56, exec_lo ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD -; CHECK-NEXT: s_and_saveexec_b32 s55, s4 -; CHECK-NEXT: s_cbranch_execz .LBB0_14 -; CHECK-NEXT: ; %bb.13: ; in Loop: Header=BB0_8 Depth=2 +; CHECK-NEXT: s_cmp_lg_u32 s4, 0 +; CHECK-NEXT: s_cmov_b32 exec_lo, s4 +; CHECK-NEXT: s_cbranch_scc0 .LBB0_16 +; CHECK-NEXT: ; %bb.15: ; in Loop: Header=BB0_10 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00 ; CHECK-NEXT: s_add_u32 s8, s34, 40 @@ -246,14 +275,16 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CHECK-NEXT: ds_write_b32 v0, v60 -; CHECK-NEXT: .LBB0_14: ; in Loop: Header=BB0_8 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s56 +; CHECK-NEXT: .LBB0_16: ; in Loop: Header=BB0_10 Depth=2 ; CHECK-NEXT: ds_read_u8 v0, v59 offset:3 +; CHECK-NEXT: s_mov_b32 s56, exec_lo ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD -; CHECK-NEXT: s_and_saveexec_b32 s55, s4 -; CHECK-NEXT: s_cbranch_execz .LBB0_7 -; CHECK-NEXT: ; %bb.15: ; in Loop: Header=BB0_8 Depth=2 +; CHECK-NEXT: s_cmp_lg_u32 s4, 0 +; CHECK-NEXT: s_cmov_b32 exec_lo, s4 +; CHECK-NEXT: s_cbranch_scc0 .LBB0_9 +; CHECK-NEXT: ; %bb.17: ; in Loop: Header=BB0_10 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00 ; CHECK-NEXT: s_add_u32 s8, s34, 40 @@ -271,40 +302,43 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CHECK-NEXT: ds_write_b32 v0, v58 -; CHECK-NEXT: s_branch .LBB0_7 -; CHECK-NEXT: .LBB0_16: ; %Flow45 -; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s53 +; 
CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s56 +; CHECK-NEXT: s_branch .LBB0_9 +; CHECK-NEXT: .LBB0_18: ; %Flow45 +; CHECK-NEXT: ; in Loop: Header=BB0_7 Depth=1 ; CHECK-NEXT: v_mov_b32_e32 v57, v0 -; CHECK-NEXT: .LBB0_17: ; %Flow46 -; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s52 -; CHECK-NEXT: s_mov_b32 s49, exec_lo -; CHECK-NEXT: v_cmpx_lt_u32_e64 v58, v42 -; CHECK-NEXT: s_cbranch_execz .LBB0_23 -; CHECK-NEXT: ; %bb.18: ; %.preheader -; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: s_mov_b32 s52, 0 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s53 +; CHECK-NEXT: .LBB0_19: ; in Loop: Header=BB0_7 Depth=1 +; CHECK-NEXT: v_cmp_lt_u32_e32 vcc_lo, v58, v42 +; CHECK-NEXT: s_xor_b32 s52, vcc_lo, exec_lo +; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; CHECK-NEXT: s_cmov_b32 exec_lo, vcc_lo +; CHECK-NEXT: s_cbranch_scc0 .LBB0_6 +; CHECK-NEXT: ; %bb.20: ; %.preheader +; CHECK-NEXT: ; in Loop: Header=BB0_7 Depth=1 +; CHECK-NEXT: s_mov_b32 s53, 0 ; CHECK-NEXT: s_inst_prefetch 0x1 -; CHECK-NEXT: s_branch .LBB0_20 +; CHECK-NEXT: s_branch .LBB0_22 ; CHECK-NEXT: .p2align 6 -; CHECK-NEXT: .LBB0_19: ; in Loop: Header=BB0_20 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s53 +; CHECK-NEXT: .LBB0_21: ; in Loop: Header=BB0_22 Depth=2 ; CHECK-NEXT: v_add_nc_u32_e32 v58, 1, v58 ; CHECK-NEXT: v_add_nc_u32_e32 v57, 1, v57 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v58, v42 -; CHECK-NEXT: s_or_b32 s52, vcc_lo, s52 -; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s52 -; CHECK-NEXT: s_cbranch_execz .LBB0_22 -; CHECK-NEXT: .LBB0_20: ; Parent Loop BB0_5 Depth=1 +; CHECK-NEXT: s_or_b32 s53, vcc_lo, s53 +; CHECK-NEXT: s_andn2_b32 s4, exec_lo, s53 +; CHECK-NEXT: s_cselect_b32 exec_lo, s4, s53 +; CHECK-NEXT: s_cbranch_scc0 .LBB0_5 +; CHECK-NEXT: .LBB0_22: ; Parent Loop BB0_7 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v58 +; CHECK-NEXT: s_mov_b32 s54, exec_lo ; CHECK-NEXT: ds_read_u8 v0, v0 ; CHECK-NEXT: 
s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD -; CHECK-NEXT: s_and_saveexec_b32 s53, s4 -; CHECK-NEXT: s_cbranch_execz .LBB0_19 -; CHECK-NEXT: ; %bb.21: ; in Loop: Header=BB0_20 Depth=2 +; CHECK-NEXT: s_cmp_lg_u32 s4, 0 +; CHECK-NEXT: s_cmov_b32 exec_lo, s4 +; CHECK-NEXT: s_cbranch_scc0 .LBB0_21 +; CHECK-NEXT: ; %bb.23: ; in Loop: Header=BB0_22 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00 ; CHECK-NEXT: s_add_u32 s8, s34, 40 @@ -321,26 +355,13 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CHECK-NEXT: ds_write_b32 v0, v57 -; CHECK-NEXT: s_branch .LBB0_19 -; CHECK-NEXT: .LBB0_22: ; %Flow43 -; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: s_inst_prefetch 0x2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s52 -; CHECK-NEXT: .LBB0_23: ; %Flow44 -; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s49 -; CHECK-NEXT: ; %bb.24: ; in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s48, v45 -; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v47 -; CHECK-NEXT: v_add_nc_u32_e32 v46, 1, v46 -; CHECK-NEXT: s_mov_b32 s49, s48 -; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_and_b32 s4, exec_lo, s4 -; CHECK-NEXT: s_or_b32 s43, s4, s43 -; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s43 -; CHECK-NEXT: s_cbranch_execnz .LBB0_5 -; CHECK-NEXT: .LBB0_25: ; %Flow51 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s54 +; CHECK-NEXT: s_branch .LBB0_21 +; CHECK-NEXT: .LBB0_24: ; %Flow47 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s43 +; CHECK-NEXT: .LBB0_25: ; %Flow49 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s42 +; CHECK-NEXT: .LBB0_26: ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: s_add_u32 s8, s34, 40 @@ -355,16 +376,19 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) 
nocapture no ; CHECK-NEXT: s_addc_u32 s7, s7, _Z7barrierj@rel32@hi+12 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_mov_b32 s4, exec_lo ; CHECK-NEXT: ds_read_b32 v47, v0 offset:15360 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_cmpx_gt_u32_e64 v47, v41 -; CHECK-NEXT: s_cbranch_execz .LBB0_33 -; CHECK-NEXT: ; %bb.26: +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, v47, v41 +; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; CHECK-NEXT: s_cmov_b32 exec_lo, vcc_lo +; CHECK-NEXT: s_cbranch_scc0 .LBB0_35 +; CHECK-NEXT: ; %bb.27: ; CHECK-NEXT: s_mov_b32 s42, 0 -; CHECK-NEXT: s_branch .LBB0_28 -; CHECK-NEXT: .LBB0_27: ; in Loop: Header=BB0_28 Depth=1 +; CHECK-NEXT: s_branch .LBB0_30 +; CHECK-NEXT: .LBB0_28: ; %Flow40 +; CHECK-NEXT: ; in Loop: Header=BB0_30 Depth=1 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s43 +; CHECK-NEXT: .LBB0_29: ; in Loop: Header=BB0_30 Depth=1 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_add_u32 s8, s34, 40 @@ -381,9 +405,10 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_add_co_u32 v41, vcc_lo, v0, v41 ; CHECK-NEXT: v_cmp_le_u32_e32 vcc_lo, v47, v41 ; CHECK-NEXT: s_or_b32 s42, vcc_lo, s42 -; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s42 -; CHECK-NEXT: s_cbranch_execz .LBB0_33 -; CHECK-NEXT: .LBB0_28: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_andn2_b32 s4, exec_lo, s42 +; CHECK-NEXT: s_cselect_b32 exec_lo, s4, s42 +; CHECK-NEXT: s_cbranch_scc0 .LBB0_35 +; CHECK-NEXT: .LBB0_30: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v41 ; CHECK-NEXT: s_mov_b32 s43, exec_lo ; CHECK-NEXT: ds_read_b32 v0, v0 @@ -410,9 +435,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_xor_b32_e32 v56, v10, v6 ; CHECK-NEXT: v_or_b32_e32 v5, v46, v57 ; CHECK-NEXT: v_or_b32_e32 v4, v45, v56 -; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[4:5] -; CHECK-NEXT: 
s_cbranch_execz .LBB0_27 -; CHECK-NEXT: ; %bb.29: ; in Loop: Header=BB0_28 Depth=1 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; CHECK-NEXT: s_cmov_b32 exec_lo, vcc_lo +; CHECK-NEXT: s_cbranch_scc0 .LBB0_29 +; CHECK-NEXT: ; %bb.31: ; in Loop: Header=BB0_30 Depth=1 ; CHECK-NEXT: s_clause 0x1 ; CHECK-NEXT: global_load_dwordx2 v[58:59], v[2:3], off offset:24 ; CHECK-NEXT: global_load_dwordx2 v[60:61], v[0:1], off offset:24 @@ -448,11 +475,12 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_mov_b32_e32 v1, v43 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CHECK-NEXT: v_bfe_u32 v0, v0, v74, 4 -; CHECK-NEXT: s_mov_b32 s4, exec_lo -; CHECK-NEXT: v_cmpx_gt_u32_e32 12, v0 -; CHECK-NEXT: s_xor_b32 s4, exec_lo, s4 -; CHECK-NEXT: s_cbranch_execz .LBB0_31 -; CHECK-NEXT: ; %bb.30: ; in Loop: Header=BB0_28 Depth=1 +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0 +; CHECK-NEXT: s_xor_b32 s4, vcc_lo, exec_lo +; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; CHECK-NEXT: s_cmov_b32 exec_lo, vcc_lo +; CHECK-NEXT: s_cbranch_scc0 .LBB0_33 +; CHECK-NEXT: ; %bb.32: ; in Loop: Header=BB0_30 Depth=1 ; CHECK-NEXT: v_xor_b32_e32 v4, v60, v58 ; CHECK-NEXT: v_lshrrev_b64 v[2:3], 16, v[56:57] ; CHECK-NEXT: v_mad_u64_u32 v[6:7], null, 0x180, v73, s[46:47] @@ -475,11 +503,14 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: global_store_dword v[6:7], v8, off offset:4 ; CHECK-NEXT: global_store_dwordx4 v[6:7], v[0:3], off offset:8 ; CHECK-NEXT: global_store_dwordx2 v[6:7], v[4:5], off offset:24 -; CHECK-NEXT: .LBB0_31: ; %Flow -; CHECK-NEXT: ; in Loop: Header=BB0_28 Depth=1 -; CHECK-NEXT: s_andn2_saveexec_b32 s4, s4 -; CHECK-NEXT: s_cbranch_execz .LBB0_27 -; CHECK-NEXT: ; %bb.32: ; in Loop: Header=BB0_28 Depth=1 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: .LBB0_33: ; %Flow +; CHECK-NEXT: ; in Loop: Header=BB0_30 Depth=1 +; CHECK-NEXT: s_xor_b32 s48, 
s4, exec_lo +; CHECK-NEXT: s_cmp_lg_u32 s4, 0 +; CHECK-NEXT: s_cmov_b32 exec_lo, s4 +; CHECK-NEXT: s_cbranch_scc0 .LBB0_28 +; CHECK-NEXT: ; %bb.34: ; in Loop: Header=BB0_30 Depth=1 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, v42 ; CHECK-NEXT: v_mov_b32_e32 v1, v43 @@ -495,8 +526,9 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_add_u32 s6, s6, _Z10atomic_subPU3AS1Vjj@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s7, s7, _Z10atomic_subPU3AS1Vjj@rel32@hi+12 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] -; CHECK-NEXT: s_branch .LBB0_27 -; CHECK-NEXT: .LBB0_33: +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s48 +; CHECK-NEXT: s_branch .LBB0_28 +; CHECK-NEXT: .LBB0_35: ; CHECK-NEXT: s_endpgm %6 = tail call i64 @_Z13get_global_idj(i32 noundef 0) #4 %7 = trunc i64 %6 to i32 @@ -851,27 +883,45 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: ds_write_b8 v46, v43 offset:15364 ; CHECK-NEXT: v_add_nc_u32_e32 v45, -1, v41 -; CHECK-NEXT: .LBB1_1: ; %.37 +; CHECK-NEXT: s_branch .LBB1_3 +; CHECK-NEXT: .LBB1_1: ; %Flow +; CHECK-NEXT: ; in Loop: Header=BB1_3 Depth=1 +; CHECK-NEXT: s_inst_prefetch 0x2 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s44 +; CHECK-NEXT: .LBB1_2: ; %.32 +; CHECK-NEXT: ; in Loop: Header=BB1_3 Depth=1 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s43, v45 +; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v43 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 +; CHECK-NEXT: s_and_b32 s4, exec_lo, s4 +; CHECK-NEXT: s_or_b32 s42, s4, s42 +; CHECK-NEXT: s_mov_b32 s4, s43 +; CHECK-NEXT: s_andn2_b32 s5, exec_lo, s42 +; CHECK-NEXT: s_cselect_b32 exec_lo, s5, s42 +; CHECK-NEXT: s_cbranch_scc0 .LBB1_12 +; CHECK-NEXT: .LBB1_3: ; %.37 ; CHECK-NEXT: ; =>This Loop Header: Depth=1 -; CHECK-NEXT: ; Child Loop BB1_3 Depth 2 -; CHECK-NEXT: ; Child Loop BB1_8 Depth 2 +; CHECK-NEXT: ; Child Loop BB1_5 Depth 2 +; CHECK-NEXT: ; Child Loop BB1_10 Depth 2 ; CHECK-NEXT: 
v_add_nc_u32_e32 v0, s4, v44 -; CHECK-NEXT: s_lshl_b32 s5, s4, 5 +; CHECK-NEXT: s_add_i32 s7, s4, 5 +; CHECK-NEXT: s_lshl_b32 s6, s4, 5 ; CHECK-NEXT: s_add_i32 s43, s4, 1 -; CHECK-NEXT: s_add_i32 s6, s4, 5 -; CHECK-NEXT: v_or3_b32 v47, s5, v42, s43 +; CHECK-NEXT: v_cmp_lt_u32_e32 vcc_lo, s7, v41 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: ds_read_u8 v46, v0 +; CHECK-NEXT: v_or3_b32 v47, s6, v42, s43 ; CHECK-NEXT: v_mov_b32_e32 v56, s43 ; CHECK-NEXT: s_mov_b32 s5, exec_lo -; CHECK-NEXT: v_cmpx_lt_u32_e64 s6, v41 -; CHECK-NEXT: s_cbranch_execz .LBB1_5 -; CHECK-NEXT: ; %bb.2: ; %.53.preheader -; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; CHECK-NEXT: s_cmov_b32 exec_lo, vcc_lo +; CHECK-NEXT: s_cbranch_scc0 .LBB1_7 +; CHECK-NEXT: ; %bb.4: ; %.53.preheader +; CHECK-NEXT: ; in Loop: Header=BB1_3 Depth=1 ; CHECK-NEXT: s_mov_b32 s6, 0 ; CHECK-NEXT: s_mov_b32 s7, 0 -; CHECK-NEXT: .LBB1_3: ; %.53 -; CHECK-NEXT: ; Parent Loop BB1_1 Depth=1 +; CHECK-NEXT: .LBB1_5: ; %.53 +; CHECK-NEXT: ; Parent Loop BB1_3 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 ; CHECK-NEXT: s_add_i32 s7, s7, 4 ; CHECK-NEXT: v_add_nc_u32_e32 v43, 1, v43 @@ -882,44 +932,48 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s9, v41 ; CHECK-NEXT: v_mov_b32_e32 v56, s8 ; CHECK-NEXT: s_or_b32 s6, vcc_lo, s6 -; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 -; CHECK-NEXT: s_cbranch_execnz .LBB1_3 -; CHECK-NEXT: ; %bb.4: ; %Flow3 -; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; CHECK-NEXT: s_andn2_b32 s8, exec_lo, s6 +; CHECK-NEXT: s_cselect_b32 exec_lo, s8, s6 +; CHECK-NEXT: s_cbranch_scc1 .LBB1_5 +; CHECK-NEXT: ; %bb.6: ; %Flow3 +; CHECK-NEXT: ; in Loop: Header=BB1_3 Depth=1 ; CHECK-NEXT: v_mov_b32_e32 v47, v0 -; CHECK-NEXT: .LBB1_5: ; %Flow4 -; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; 
CHECK-NEXT: s_mov_b32 s44, exec_lo -; CHECK-NEXT: v_cmpx_lt_u32_e64 v56, v41 -; CHECK-NEXT: s_cbranch_execz .LBB1_11 -; CHECK-NEXT: ; %bb.6: ; %.103.preheader -; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; CHECK-NEXT: .LBB1_7: ; %.48 +; CHECK-NEXT: ; in Loop: Header=BB1_3 Depth=1 +; CHECK-NEXT: v_cmp_lt_u32_e32 vcc_lo, v56, v41 +; CHECK-NEXT: s_xor_b32 s44, vcc_lo, exec_lo +; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; CHECK-NEXT: s_cmov_b32 exec_lo, vcc_lo +; CHECK-NEXT: s_cbranch_scc0 .LBB1_2 +; CHECK-NEXT: ; %bb.8: ; %.103.preheader +; CHECK-NEXT: ; in Loop: Header=BB1_3 Depth=1 ; CHECK-NEXT: s_mov_b32 s45, 0 ; CHECK-NEXT: s_inst_prefetch 0x1 -; CHECK-NEXT: s_branch .LBB1_8 +; CHECK-NEXT: s_branch .LBB1_10 ; CHECK-NEXT: .p2align 6 -; CHECK-NEXT: .LBB1_7: ; %.114 -; CHECK-NEXT: ; in Loop: Header=BB1_8 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s46 +; CHECK-NEXT: .LBB1_9: ; %.114 +; CHECK-NEXT: ; in Loop: Header=BB1_10 Depth=2 ; CHECK-NEXT: v_add_nc_u32_e32 v56, 1, v56 ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v56, v41 ; CHECK-NEXT: s_or_b32 s45, vcc_lo, s45 -; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s45 -; CHECK-NEXT: s_cbranch_execz .LBB1_10 -; CHECK-NEXT: .LBB1_8: ; %.103 -; CHECK-NEXT: ; Parent Loop BB1_1 Depth=1 +; CHECK-NEXT: s_andn2_b32 s4, exec_lo, s45 +; CHECK-NEXT: s_cselect_b32 exec_lo, s4, s45 +; CHECK-NEXT: s_cbranch_scc0 .LBB1_1 +; CHECK-NEXT: .LBB1_10: ; %.103 +; CHECK-NEXT: ; Parent Loop BB1_3 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v56 +; CHECK-NEXT: s_mov_b32 s46, exec_lo ; CHECK-NEXT: ds_read_u8 v0, v0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v46, v0 src0_sel:BYTE_0 src1_sel:DWORD -; CHECK-NEXT: s_and_saveexec_b32 s46, s4 -; CHECK-NEXT: s_cbranch_execz .LBB1_7 -; CHECK-NEXT: ; %bb.9: ; %.110 -; CHECK-NEXT: ; in Loop: Header=BB1_8 Depth=2 +; CHECK-NEXT: s_cmp_lg_u32 s4, 0 +; CHECK-NEXT: s_cmov_b32 exec_lo, s4 +; 
CHECK-NEXT: s_cbranch_scc0 .LBB1_9 +; CHECK-NEXT: ; %bb.11: ; %.110 +; CHECK-NEXT: ; in Loop: Header=BB1_10 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00 ; CHECK-NEXT: s_add_u32 s8, s36, 40 @@ -936,26 +990,9 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CHECK-NEXT: ds_write_b32 v0, v47 -; CHECK-NEXT: s_branch .LBB1_7 -; CHECK-NEXT: .LBB1_10: ; %Flow -; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; CHECK-NEXT: s_inst_prefetch 0x2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s45 -; CHECK-NEXT: .LBB1_11: ; %Flow2 -; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s44 -; CHECK-NEXT: ; %bb.12: ; %.32 -; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s43, v45 -; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v43 -; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_and_b32 s4, exec_lo, s4 -; CHECK-NEXT: s_or_b32 s42, s4, s42 -; CHECK-NEXT: s_mov_b32 s4, s43 -; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s42 -; CHECK-NEXT: s_cbranch_execnz .LBB1_1 -; CHECK-NEXT: ; %bb.13: ; %.119 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s42 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s46 +; CHECK-NEXT: s_branch .LBB1_9 +; CHECK-NEXT: .LBB1_12: ; %.119 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: s_add_u32 s8, s36, 40 diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.mir index 329f2967121603..c4c3878a7e98bf 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.mir @@ -30,7 +30,6 @@ body: | ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK]], implicit-def 
dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: FLAT_STORE_DWORD [[COPY1]], [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) ; CHECK-NEXT: SI_RETURN bb.0: @@ -57,7 +56,6 @@ body: | S_BRANCH %bb.2 bb.2: - SI_END_CF %6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec FLAT_STORE_DWORD %3, %9, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) SI_RETURN ... @@ -93,7 +91,6 @@ body: | ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: FLAT_STORE_DWORD [[COPY1]], [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) ; CHECK-NEXT: SI_RETURN bb.0: @@ -122,7 +119,6 @@ body: | S_BRANCH %bb.2 bb.2: - SI_END_CF %6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec FLAT_STORE_DWORD %3, %11, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) SI_RETURN ... 
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll index 1dd18b4228fe5e..f77b7c011da173 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll @@ -106,24 +106,30 @@ define void @issue63986(i64 %0, i64 %idxprom) { ; CHECK-NEXT: s_branch .LBB0_12 ; CHECK-NEXT: .LBB0_10: ; %Flow19 ; CHECK-NEXT: ; in Loop: Header=BB0_12 Depth=1 -; CHECK-NEXT: s_or_b64 exec, exec, s[10:11] ; CHECK-NEXT: s_mov_b64 s[8:9], 0 ; CHECK-NEXT: .LBB0_11: ; %Flow21 ; CHECK-NEXT: ; in Loop: Header=BB0_12 Depth=1 ; CHECK-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; CHECK-NEXT: s_cbranch_vccz .LBB0_20 +; CHECK-NEXT: s_cbranch_vccz .LBB0_21 ; CHECK-NEXT: .LBB0_12: ; %while.cond ; CHECK-NEXT: ; =>This Loop Header: Depth=1 -; CHECK-NEXT: ; Child Loop BB0_14 Depth 2 -; CHECK-NEXT: ; Child Loop BB0_18 Depth 2 -; CHECK-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] -; CHECK-NEXT: s_cbranch_execz .LBB0_15 -; CHECK-NEXT: ; %bb.13: ; %loop-memcpy-expansion2.preheader +; CHECK-NEXT: ; Child Loop BB0_15 Depth 2 +; CHECK-NEXT: ; Child Loop BB0_19 Depth 2 +; CHECK-NEXT: s_mov_b64 s[8:9], exec +; CHECK-NEXT: s_and_b64 s[10:11], s[4:5], exec +; CHECK-NEXT: s_cmov_b64 exec, s[10:11] +; CHECK-NEXT: s_cbranch_scc1 .LBB0_14 +; CHECK-NEXT: ; %bb.13: ; %Flow20 +; CHECK-NEXT: ; in Loop: Header=BB0_12 Depth=1 +; CHECK-NEXT: s_mov_b64 s[8:9], -1 +; CHECK-NEXT: s_cbranch_execz .LBB0_11 +; CHECK-NEXT: s_branch .LBB0_17 +; CHECK-NEXT: .LBB0_14: ; %loop-memcpy-expansion2.preheader ; CHECK-NEXT: ; in Loop: Header=BB0_12 Depth=1 ; CHECK-NEXT: s_mov_b64 s[10:11], 0 ; CHECK-NEXT: s_mov_b64 s[12:13], 0 ; CHECK-NEXT: s_mov_b64 s[14:15], 0 -; CHECK-NEXT: .LBB0_14: ; %loop-memcpy-expansion2 +; CHECK-NEXT: .LBB0_15: ; %loop-memcpy-expansion2 ; CHECK-NEXT: ; Parent Loop BB0_12 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v10, s10 @@ -152,6 +158,7 @@ define void 
@issue63986(i64 %0, i64 %idxprom) { ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc, s[14:15], v[4:5] ; CHECK-NEXT: s_addc_u32 s11, s11, 0 ; CHECK-NEXT: s_or_b64 s[12:13], vcc, s[12:13] +; CHECK-NEXT: s_andn2_b64 s[16:17], exec, s[12:13] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: flat_store_byte v[10:11], v15 offset:3 ; CHECK-NEXT: flat_store_byte v[10:11], v16 offset:2 @@ -169,23 +176,25 @@ define void @issue63986(i64 %0, i64 %idxprom) { ; CHECK-NEXT: flat_store_byte v[10:11], v21 offset:14 ; CHECK-NEXT: flat_store_byte v[10:11], v20 offset:13 ; CHECK-NEXT: flat_store_byte v[10:11], v27 offset:12 -; CHECK-NEXT: s_andn2_b64 exec, exec, s[12:13] -; CHECK-NEXT: s_cbranch_execnz .LBB0_14 -; CHECK-NEXT: .LBB0_15: ; %Flow20 +; CHECK-NEXT: s_cselect_b64 exec, s[16:17], s[12:13] +; CHECK-NEXT: s_cbranch_scc1 .LBB0_15 +; CHECK-NEXT: ; %bb.16: ; %loop.exit.guard ; CHECK-NEXT: ; in Loop: Header=BB0_12 Depth=1 ; CHECK-NEXT: s_or_b64 exec, exec, s[8:9] ; CHECK-NEXT: s_mov_b64 s[8:9], -1 ; CHECK-NEXT: s_cbranch_execz .LBB0_11 -; CHECK-NEXT: ; %bb.16: ; %loop-memcpy-residual-header5 +; CHECK-NEXT: .LBB0_17: ; %loop-memcpy-residual-header5 ; CHECK-NEXT: ; in Loop: Header=BB0_12 Depth=1 -; CHECK-NEXT: s_and_saveexec_b64 s[8:9], s[6:7] -; CHECK-NEXT: s_xor_b64 s[10:11], exec, s[8:9] -; CHECK-NEXT: s_cbranch_execz .LBB0_10 -; CHECK-NEXT: ; %bb.17: ; %loop-memcpy-residual4.preheader +; CHECK-NEXT: s_and_b64 s[8:9], s[6:7], exec +; CHECK-NEXT: s_xor_b64 s[10:11], s[8:9], exec +; CHECK-NEXT: s_cmp_lg_u64 s[8:9], 0 +; CHECK-NEXT: s_cmov_b64 exec, s[8:9] +; CHECK-NEXT: s_cbranch_scc0 .LBB0_10 +; CHECK-NEXT: ; %bb.18: ; %loop-memcpy-residual4.preheader ; CHECK-NEXT: ; in Loop: Header=BB0_12 Depth=1 ; CHECK-NEXT: s_mov_b64 s[12:13], 0 ; CHECK-NEXT: s_mov_b64 s[14:15], 0 -; CHECK-NEXT: .LBB0_18: ; %loop-memcpy-residual4 +; CHECK-NEXT: .LBB0_19: ; %loop-memcpy-residual4 ; CHECK-NEXT: ; Parent Loop BB0_12 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 ; CHECK-NEXT: v_mov_b32_e32 
v12, s15 @@ -198,15 +207,16 @@ define void @issue63986(i64 %0, i64 %idxprom) { ; CHECK-NEXT: v_cmp_ge_u64_e64 s[8:9], s[14:15], v[6:7] ; CHECK-NEXT: v_addc_co_u32_e32 v11, vcc, v9, v12, vcc ; CHECK-NEXT: s_or_b64 s[12:13], s[8:9], s[12:13] +; CHECK-NEXT: s_andn2_b64 s[8:9], exec, s[12:13] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: flat_store_byte v[10:11], v13 -; CHECK-NEXT: s_andn2_b64 exec, exec, s[12:13] -; CHECK-NEXT: s_cbranch_execnz .LBB0_18 -; CHECK-NEXT: ; %bb.19: ; %Flow +; CHECK-NEXT: s_cselect_b64 exec, s[8:9], s[12:13] +; CHECK-NEXT: s_cbranch_scc1 .LBB0_19 +; CHECK-NEXT: ; %bb.20: ; %Flow ; CHECK-NEXT: ; in Loop: Header=BB0_12 Depth=1 -; CHECK-NEXT: s_or_b64 exec, exec, s[12:13] +; CHECK-NEXT: s_or_b64 exec, exec, s[10:11] ; CHECK-NEXT: s_branch .LBB0_10 -; CHECK-NEXT: .LBB0_20: ; %DummyReturnBlock +; CHECK-NEXT: .LBB0_21: ; %DummyReturnBlock ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/mmra.ll b/llvm/test/CodeGen/AMDGPU/mmra.ll index 0167fcbc4ab7c0..aaa39335db61fd 100644 --- a/llvm/test/CodeGen/AMDGPU/mmra.ll +++ b/llvm/test/CodeGen/AMDGPU/mmra.ll @@ -91,8 +91,6 @@ define void @atomicrmw_rel(ptr %ptr) { ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2.atomicrmw.end: - ; CHECK-NEXT: [[PHI2:%[0-9]+]]:sreg_64 = PHI [[SI_IF_BREAK]], %bb.1 - ; CHECK-NEXT: SI_END_CF [[PHI2]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: SI_RETURN %old.2 = atomicrmw add ptr %ptr, i8 0 release, !mmra !1 ret void @@ -159,22 +157,20 @@ define void @cmpxchg(ptr %ptr) { ; CHECK-NEXT: [[S_ANDN2_B64_:%[0-9]+]]:sreg_64 = S_ANDN2_B64 [[S_OR_B64_]], $exec, implicit-def $scc ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_]], $exec, implicit-def $scc ; CHECK-NEXT: [[S_OR_B64_1:%[0-9]+]]:sreg_64 = S_OR_B64 [[S_ANDN2_B64_]], [[S_AND_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def 
dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3.Flow: ; CHECK-NEXT: successors: %bb.4(0x04000000), %bb.1(0x7c000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[PHI3:%[0-9]+]]:sreg_64 = PHI [[S_OR_B64_]], %bb.1, [[S_OR_B64_1]], %bb.2 ; CHECK-NEXT: [[PHI4:%[0-9]+]]:vgpr_32 = PHI [[COPY7]], %bb.1, [[V_AND_B32_e64_3]], %bb.2 - ; CHECK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[PHI3]] ; CHECK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_64 = SI_IF_BREAK [[COPY8]], [[PHI1]], implicit-def dead $scc ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.4 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4.partword.cmpxchg.end: - ; CHECK-NEXT: [[PHI5:%[0-9]+]]:sreg_64 = PHI [[SI_IF_BREAK]], %bb.3 - ; CHECK-NEXT: [[PHI6:%[0-9]+]]:vgpr_32 = PHI [[FLAT_ATOMIC_CMPSWAP_RTN]], %bb.3 - ; CHECK-NEXT: SI_END_CF [[PHI5]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[FLAT_ATOMIC_CMPSWAP_RTN]], %bb.3 ; CHECK-NEXT: SI_RETURN %pair = cmpxchg ptr %ptr, i8 0, i8 1 acquire acquire, !mmra !2 ret void diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll index eb638da3904055..245817e7d16dc0 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll @@ -23,34 +23,34 @@ define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, ptr addrspace(1 ; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[8:11], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 -; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GCN-NEXT: s_cbranch_execz .LBB0_4 +; GCN-NEXT: s_and_b64 s[0:1], vcc, -1 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB0_4 ; GCN-NEXT: 
; %bb.1: ; %atomic ; GCN-NEXT: s_mov_b32 s8, s10 ; GCN-NEXT: s_mov_b32 s9, s10 -; GCN-NEXT: buffer_load_dword v4, v[1:2], s[8:11], 0 addr64 offset:400 +; GCN-NEXT: buffer_load_dword v5, v[1:2], s[8:11], 0 addr64 offset:400 ; GCN-NEXT: s_load_dword s2, s[2:3], 0xf ; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: .LBB0_2: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: v_max_i32_e32 v3, s2, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v6, v4 -; GCN-NEXT: v_mov_b32_e32 v5, v3 -; GCN-NEXT: buffer_atomic_cmpswap v[5:6], v[1:2], s[8:11], 0 addr64 offset:400 glc +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_max_i32_e32 v4, s2, v5 +; GCN-NEXT: v_mov_b32_e32 v3, v4 +; GCN-NEXT: v_mov_b32_e32 v4, v5 +; GCN-NEXT: buffer_atomic_cmpswap v[3:4], v[1:2], s[8:11], 0 addr64 offset:400 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_wbinvl1 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GCN-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v4, v5 -; GCN-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN-NEXT: s_cbranch_execnz .LBB0_2 +; GCN-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v5, v3 +; GCN-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN-NEXT: s_cbranch_scc1 .LBB0_2 ; GCN-NEXT: ; %bb.3: ; %atomicrmw.end -; GCN-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: buffer_store_dword v5, off, s[4:7], 0 +; GCN-NEXT: buffer_store_dword v3, off, s[4:7], 0 ; GCN-NEXT: .LBB0_4: ; %exit ; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -82,8 +82,9 @@ define amdgpu_kernel void @atomic_max_i32_noret(ptr addrspace(1) %out, ptr addrs ; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[4:7], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 -; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GCN-NEXT: 
s_cbranch_execz .LBB1_3 +; GCN-NEXT: s_and_b64 s[0:1], vcc, -1 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB1_3 ; GCN-NEXT: ; %bb.1: ; %atomic ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 @@ -102,9 +103,10 @@ define amdgpu_kernel void @atomic_max_i32_noret(ptr addrspace(1) %out, ptr addrs ; GCN-NEXT: buffer_wbinvl1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GCN-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN-NEXT: s_andn2_b64 s[8:9], exec, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v4, v5 -; GCN-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN-NEXT: s_cbranch_execnz .LBB1_2 +; GCN-NEXT: s_cselect_b64 exec, s[8:9], s[0:1] +; GCN-NEXT: s_cbranch_scc1 .LBB1_2 ; GCN-NEXT: .LBB1_3: ; %exit ; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll index 90a3d350e7416e..6ceec35b6f6327 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll @@ -23,8 +23,9 @@ define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, ptr addrspace(1 ; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[8:11], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 -; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GCN-NEXT: s_cbranch_execz .LBB0_2 +; GCN-NEXT: s_and_b64 s[0:1], vcc, -1 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %atomic ; GCN-NEXT: s_load_dword s0, s[2:3], 0xf ; GCN-NEXT: s_mov_b32 s8, s10 @@ -67,8 +68,9 @@ define amdgpu_kernel void @atomic_max_i32_noret(ptr addrspace(1) %out, ptr addrs ; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[4:7], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 -; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GCN-NEXT: s_cbranch_execz .LBB1_2 +; GCN-NEXT: s_and_b64 s[0:1], vcc, -1 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: 
s_cbranch_scc0 .LBB1_2 ; GCN-NEXT: ; %bb.1: ; %atomic ; GCN-NEXT: s_load_dword s0, s[2:3], 0xf ; GCN-NEXT: s_mov_b32 s4, s6 diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-vimage-vsample.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-vimage-vsample.ll index 003c3ea7fce104..1289632f9d6e5b 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-vimage-vsample.ll +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-vimage-vsample.ll @@ -60,11 +60,10 @@ define amdgpu_ps float @vimage_move_to_valu(<8 x i32> %rsrc) { ; GFX11-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[IMAGE_LOAD_V1_V2_gfx11_:%[0-9]+]]:vgpr_32 = IMAGE_LOAD_V1_V2_gfx11 [[REG_SEQUENCE1]], killed [[REG_SEQUENCE6]], 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) - ; GFX11-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX11-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX11-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX11-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_MOV_B32_1]], %bb.1, implicit $scc ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: bb.3: - ; GFX11-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_1]] ; GFX11-NEXT: $vgpr0 = COPY [[IMAGE_LOAD_V1_V2_gfx11_]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; @@ -122,11 +121,10 @@ define amdgpu_ps float @vimage_move_to_valu(<8 x i32> %rsrc) { ; GFX12-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[IMAGE_LOAD_V1_V2_gfx12_:%[0-9]+]]:vgpr_32 = IMAGE_LOAD_V1_V2_gfx12 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], killed [[REG_SEQUENCE5]], 1, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term 
$exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_MOV_B32_]], %bb.1, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.3: - ; GFX12-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]] ; GFX12-NEXT: $vgpr0 = COPY [[IMAGE_LOAD_V1_V2_gfx12_]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0 bb: @@ -194,11 +192,10 @@ define amdgpu_ps float @vsample_move_to_valu_rsrc(<8 x i32> %rsrc, <4 x i32> inr ; GFX11-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[IMAGE_SAMPLE_V1_V1_gfx11_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_V1_V1_gfx11 [[V_MOV_B32_e32_]], killed [[REG_SEQUENCE6]], [[REG_SEQUENCE1]], 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) - ; GFX11-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX11-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX11-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX11-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_MOV_B32_]], %bb.1, implicit $scc ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: bb.3: - ; GFX11-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]] ; GFX11-NEXT: $vgpr0 = COPY [[IMAGE_SAMPLE_V1_V1_gfx11_]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; @@ -261,11 +258,10 @@ define amdgpu_ps float @vsample_move_to_valu_rsrc(<8 x i32> %rsrc, <4 x i32> inr ; GFX12-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[IMAGE_SAMPLE_V1_V1_gfx12_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_V1_V1_gfx12 [[V_MOV_B32_e32_]], killed [[REG_SEQUENCE6]], [[REG_SEQUENCE1]], 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, 
[[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_MOV_B32_]], %bb.1, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.3: - ; GFX12-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]] ; GFX12-NEXT: $vgpr0 = COPY [[IMAGE_SAMPLE_V1_V1_gfx12_]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0 main_body: @@ -319,11 +315,10 @@ define amdgpu_ps float @vsample_move_to_valu_samp(<8 x i32> inreg %rsrc, <4 x i3 ; GFX11-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[IMAGE_SAMPLE_V1_V1_gfx11_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_V1_V1_gfx11 [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], killed [[REG_SEQUENCE4]], 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) - ; GFX11-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX11-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX11-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX11-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_MOV_B32_]], %bb.1, implicit $scc ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: bb.3: - ; GFX11-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]] ; GFX11-NEXT: $vgpr0 = COPY [[IMAGE_SAMPLE_V1_V1_gfx11_]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; @@ -372,11 +367,10 @@ define amdgpu_ps float @vsample_move_to_valu_samp(<8 x i32> inreg %rsrc, <4 x i3 ; GFX12-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[IMAGE_SAMPLE_V1_V1_gfx12_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_V1_V1_gfx12 [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], killed [[REG_SEQUENCE4]], 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, 
[[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_MOV_B32_]], %bb.1, implicit $scc ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.3: - ; GFX12-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]] ; GFX12-NEXT: $vgpr0 = COPY [[IMAGE_SAMPLE_V1_V1_gfx12_]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0 main_body: diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll index 06ebd86b795107..93295ef093f024 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll @@ -25,12 +25,12 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 { ; GFX9_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX9_W64-NEXT: s_nop 0 ; GFX9_W64-NEXT: buffer_load_format_x v5, v4, s[8:11], 0 idxen +; GFX9_W64-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9_W64-NEXT: ; implicit-def: $vgpr4 -; GFX9_W64-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX9_W64-NEXT: s_cbranch_execnz .LBB0_1 +; GFX9_W64-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX9_W64-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX9_W64-NEXT: ; %bb.2: -; GFX9_W64-NEXT: s_mov_b64 exec, s[6:7] ; GFX9_W64-NEXT: s_waitcnt vmcnt(0) ; GFX9_W64-NEXT: v_mov_b32_e32 v0, v5 ; GFX9_W64-NEXT: s_setpc_b64 s[30:31] @@ -49,13 +49,13 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 { ; GFX1010_W32-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX1010_W32-NEXT: s_and_saveexec_b32 s4, s4 ; GFX1010_W32-NEXT: buffer_load_format_x v5, v4, s[8:11], 0 idxen +; GFX1010_W32-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr4 ; GFX1010_W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX1010_W32-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1010_W32-NEXT: s_cselect_b32 exec_lo, s4, s5 +; 
GFX1010_W32-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX1010_W32-NEXT: ; %bb.2: -; GFX1010_W32-NEXT: s_mov_b32 exec_lo, s5 ; GFX1010_W32-NEXT: s_waitcnt vmcnt(0) ; GFX1010_W32-NEXT: v_mov_b32_e32 v0, v5 ; GFX1010_W32-NEXT: s_setpc_b64 s[30:31] @@ -74,13 +74,13 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 { ; GFX1010_W64-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX1010_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX1010_W64-NEXT: buffer_load_format_x v5, v4, s[8:11], 0 idxen +; GFX1010_W64-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1010_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1010_W64-NEXT: ; implicit-def: $vgpr4 ; GFX1010_W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010_W64-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX1010_W64-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1010_W64-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX1010_W64-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX1010_W64-NEXT: ; %bb.2: -; GFX1010_W64-NEXT: s_mov_b64 exec, s[6:7] ; GFX1010_W64-NEXT: s_waitcnt vmcnt(0) ; GFX1010_W64-NEXT: v_mov_b32_e32 v0, v5 ; GFX1010_W64-NEXT: s_setpc_b64 s[30:31] @@ -101,12 +101,13 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 { ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1100_W32-NEXT: buffer_load_format_x v5, v4, s[4:7], 0 idxen +; GFX1100_W32-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr4 -; GFX1100_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1100_W32-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1100_W32-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX1100_W32-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX1100_W32-NEXT: ; %bb.2: -; GFX1100_W32-NEXT: s_mov_b32 exec_lo, s1 ; GFX1100_W32-NEXT: s_waitcnt vmcnt(0) ; GFX1100_W32-NEXT: v_mov_b32_e32 v0, v5 ; GFX1100_W32-NEXT: s_setpc_b64 s[30:31] @@ -127,12 +128,13 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 { ; GFX1100_W64-NEXT: s_and_b64 s[0:1], 
vcc, s[0:1] ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX1100_W64-NEXT: buffer_load_format_x v5, v4, s[4:7], 0 idxen +; GFX1100_W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1100_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1100_W64-NEXT: ; implicit-def: $vgpr4 -; GFX1100_W64-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1100_W64-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1100_W64-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX1100_W64-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX1100_W64-NEXT: ; %bb.2: -; GFX1100_W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX1100_W64-NEXT: s_waitcnt vmcnt(0) ; GFX1100_W64-NEXT: v_mov_b32_e32 v0, v5 ; GFX1100_W64-NEXT: s_setpc_b64 s[30:31] @@ -210,33 +212,31 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 { ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; %bb.2: ; in Loop: Header=BB0_1 Depth=1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: v_readlane_b32 s6, v1, 1 +; W64-O0-NEXT: v_readlane_b32 s7, v1, 2 ; W64-O0-NEXT: v_readlane_b32 s4, v1, 7 ; W64-O0-NEXT: v_readlane_b32 s5, v1, 8 ; W64-O0-NEXT: v_readlane_b32 s8, v1, 3 ; W64-O0-NEXT: v_readlane_b32 s9, v1, 4 ; W64-O0-NEXT: v_readlane_b32 s10, v1, 5 ; W64-O0-NEXT: v_readlane_b32 s11, v1, 6 -; W64-O0-NEXT: v_readlane_b32 s6, v1, 0 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: s_nop 2 -; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen +; W64-O0-NEXT: v_readlane_b32 s12, v1, 0 +; W64-O0-NEXT: s_nop 4 +; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s12 idxen ; W64-O0-NEXT: s_waitcnt 
vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] -; W64-O0-NEXT: s_cbranch_execnz .LBB0_1 +; W64-O0-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; W64-O0-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; W64-O0-NEXT: s_cbranch_scc1 .LBB0_1 ; W64-O0-NEXT: ; %bb.3: ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 1 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 2 -; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; W64-O0-NEXT: ; kill: killed $vgpr1 ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 @@ -271,11 +271,11 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; GFX9_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX9_W64-NEXT: s_nop 0 ; GFX9_W64-NEXT: buffer_load_format_x v13, v8, s[8:11], 0 idxen +; GFX9_W64-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX9_W64-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX9_W64-NEXT: s_cbranch_execnz .LBB1_1 +; GFX9_W64-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX9_W64-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX9_W64-NEXT: ; %bb.2: -; GFX9_W64-NEXT: s_mov_b64 exec, s[6:7] ; GFX9_W64-NEXT: s_mov_b64 s[6:7], exec ; GFX9_W64-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1 ; GFX9_W64-NEXT: v_readfirstlane_b32 s8, v4 @@ -288,12 +288,12 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; GFX9_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX9_W64-NEXT: s_nop 0 ; GFX9_W64-NEXT: buffer_load_format_x v0, v8, s[8:11], 0 idxen +; GFX9_W64-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX9_W64-NEXT: ; implicit-def: $vgpr8 -; GFX9_W64-NEXT: s_xor_b64 
exec, exec, s[4:5] -; GFX9_W64-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9_W64-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX9_W64-NEXT: s_cbranch_scc1 .LBB1_3 ; GFX9_W64-NEXT: ; %bb.4: -; GFX9_W64-NEXT: s_mov_b64 exec, s[6:7] ; GFX9_W64-NEXT: s_waitcnt vmcnt(1) ; GFX9_W64-NEXT: global_store_dword v[9:10], v13, off ; GFX9_W64-NEXT: s_waitcnt vmcnt(0) @@ -315,12 +315,12 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; GFX1010_W32-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX1010_W32-NEXT: s_and_saveexec_b32 s4, s4 ; GFX1010_W32-NEXT: buffer_load_format_x v13, v8, s[8:11], 0 idxen +; GFX1010_W32-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1010_W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX1010_W32-NEXT: s_cbranch_execnz .LBB1_1 +; GFX1010_W32-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX1010_W32-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1010_W32-NEXT: ; %bb.2: -; GFX1010_W32-NEXT: s_mov_b32 exec_lo, s5 ; GFX1010_W32-NEXT: s_mov_b32 s5, exec_lo ; GFX1010_W32-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s8, v4 @@ -332,13 +332,13 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; GFX1010_W32-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX1010_W32-NEXT: s_and_saveexec_b32 s4, s4 ; GFX1010_W32-NEXT: buffer_load_format_x v0, v8, s[8:11], 0 idxen +; GFX1010_W32-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr8 ; GFX1010_W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX1010_W32-NEXT: s_cbranch_execnz .LBB1_3 +; GFX1010_W32-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX1010_W32-NEXT: s_cbranch_scc1 .LBB1_3 ; GFX1010_W32-NEXT: ; %bb.4: -; GFX1010_W32-NEXT: s_mov_b32 exec_lo, s5 ; GFX1010_W32-NEXT: s_waitcnt vmcnt(1) ; GFX1010_W32-NEXT: global_store_dword v[9:10], v13, off ; 
GFX1010_W32-NEXT: s_waitcnt_vscnt null, 0x0 @@ -361,12 +361,12 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; GFX1010_W64-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX1010_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX1010_W64-NEXT: buffer_load_format_x v13, v8, s[8:11], 0 idxen +; GFX1010_W64-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1010_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1010_W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010_W64-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX1010_W64-NEXT: s_cbranch_execnz .LBB1_1 +; GFX1010_W64-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX1010_W64-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1010_W64-NEXT: ; %bb.2: -; GFX1010_W64-NEXT: s_mov_b64 exec, s[6:7] ; GFX1010_W64-NEXT: s_mov_b64 s[6:7], exec ; GFX1010_W64-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s8, v4 @@ -378,13 +378,13 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; GFX1010_W64-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX1010_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX1010_W64-NEXT: buffer_load_format_x v0, v8, s[8:11], 0 idxen +; GFX1010_W64-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1010_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1010_W64-NEXT: ; implicit-def: $vgpr8 ; GFX1010_W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010_W64-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX1010_W64-NEXT: s_cbranch_execnz .LBB1_3 +; GFX1010_W64-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX1010_W64-NEXT: s_cbranch_scc1 .LBB1_3 ; GFX1010_W64-NEXT: ; %bb.4: -; GFX1010_W64-NEXT: s_mov_b64 exec, s[6:7] ; GFX1010_W64-NEXT: s_waitcnt vmcnt(1) ; GFX1010_W64-NEXT: global_store_dword v[9:10], v13, off ; GFX1010_W64-NEXT: s_waitcnt_vscnt null, 0x0 @@ -409,12 +409,12 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1100_W32-NEXT: 
buffer_load_format_x v13, v8, s[4:7], 0 idxen +; GFX1100_W32-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX1100_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1100_W32-NEXT: s_cbranch_execnz .LBB1_1 -; GFX1100_W32-NEXT: ; %bb.2: -; GFX1100_W32-NEXT: s_mov_b32 exec_lo, s1 ; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1100_W32-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX1100_W32-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1100_W32-NEXT: ; %bb.2: ; GFX1100_W32-NEXT: s_mov_b32 s1, exec_lo ; GFX1100_W32-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s4, v4 @@ -428,12 +428,13 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1100_W32-NEXT: buffer_load_format_x v0, v8, s[4:7], 0 idxen +; GFX1100_W32-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr8 -; GFX1100_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1100_W32-NEXT: s_cbranch_execnz .LBB1_3 +; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1100_W32-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX1100_W32-NEXT: s_cbranch_scc1 .LBB1_3 ; GFX1100_W32-NEXT: ; %bb.4: -; GFX1100_W32-NEXT: s_mov_b32 exec_lo, s1 ; GFX1100_W32-NEXT: s_waitcnt vmcnt(1) ; GFX1100_W32-NEXT: global_store_b32 v[9:10], v13, off dlc ; GFX1100_W32-NEXT: s_waitcnt_vscnt null, 0x0 @@ -458,12 +459,12 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX1100_W64-NEXT: buffer_load_format_x v13, v8, s[4:7], 0 idxen +; GFX1100_W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1100_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX1100_W64-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1100_W64-NEXT: 
s_cbranch_execnz .LBB1_1 -; GFX1100_W64-NEXT: ; %bb.2: -; GFX1100_W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1100_W64-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX1100_W64-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1100_W64-NEXT: ; %bb.2: ; GFX1100_W64-NEXT: s_mov_b64 s[2:3], exec ; GFX1100_W64-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s4, v4 @@ -477,12 +478,13 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX1100_W64-NEXT: buffer_load_format_x v0, v8, s[4:7], 0 idxen +; GFX1100_W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1100_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1100_W64-NEXT: ; implicit-def: $vgpr8 -; GFX1100_W64-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1100_W64-NEXT: s_cbranch_execnz .LBB1_3 +; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1100_W64-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX1100_W64-NEXT: s_cbranch_scc1 .LBB1_3 ; GFX1100_W64-NEXT: ; %bb.4: -; GFX1100_W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX1100_W64-NEXT: s_waitcnt vmcnt(1) ; GFX1100_W64-NEXT: global_store_b32 v[9:10], v13, off dlc ; GFX1100_W64-NEXT: s_waitcnt_vscnt null, 0x0 @@ -602,34 +604,33 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; %bb.2: ; in Loop: Header=BB1_1 Depth=1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: v_readlane_b32 s6, v1, 1 +; W64-O0-NEXT: v_readlane_b32 s7, v1, 2 ; W64-O0-NEXT: 
v_readlane_b32 s4, v1, 7 ; W64-O0-NEXT: v_readlane_b32 s5, v1, 8 ; W64-O0-NEXT: v_readlane_b32 s8, v1, 3 ; W64-O0-NEXT: v_readlane_b32 s9, v1, 4 ; W64-O0-NEXT: v_readlane_b32 s10, v1, 5 ; W64-O0-NEXT: v_readlane_b32 s11, v1, 6 -; W64-O0-NEXT: v_readlane_b32 s6, v1, 0 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: s_nop 2 -; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen +; W64-O0-NEXT: v_readlane_b32 s12, v1, 0 +; W64-O0-NEXT: s_nop 4 +; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s12 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] -; W64-O0-NEXT: s_cbranch_execnz .LBB1_1 +; W64-O0-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; W64-O0-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; W64-O0-NEXT: s_cbranch_scc1 .LBB1_1 ; W64-O0-NEXT: ; %bb.3: ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v0, 1 -; W64-O0-NEXT: v_readlane_b32 s5, v0, 2 -; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_mov_b64 s[4:5], exec +; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: v_writelane_b32 v0, s4, 9 ; W64-O0-NEXT: v_writelane_b32 v0, s5, 10 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 @@ -670,33 +671,31 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; %bb.5: ; in Loop: Header=BB1_4 Depth=1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: 
s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: v_readlane_b32 s6, v1, 9 +; W64-O0-NEXT: v_readlane_b32 s7, v1, 10 ; W64-O0-NEXT: v_readlane_b32 s4, v1, 15 ; W64-O0-NEXT: v_readlane_b32 s5, v1, 16 ; W64-O0-NEXT: v_readlane_b32 s8, v1, 11 ; W64-O0-NEXT: v_readlane_b32 s9, v1, 12 ; W64-O0-NEXT: v_readlane_b32 s10, v1, 13 ; W64-O0-NEXT: v_readlane_b32 s11, v1, 14 -; W64-O0-NEXT: v_readlane_b32 s6, v1, 0 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: s_nop 2 -; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen +; W64-O0-NEXT: v_readlane_b32 s12, v1, 0 +; W64-O0-NEXT: s_nop 4 +; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s12 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] -; W64-O0-NEXT: s_cbranch_execnz .LBB1_4 +; W64-O0-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; W64-O0-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; W64-O0-NEXT: s_cbranch_scc1 .LBB1_4 ; W64-O0-NEXT: ; %bb.6: ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v0, 9 -; W64-O0-NEXT: v_readlane_b32 s5, v0, 10 -; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload @@ -746,16 +745,18 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX9_W64-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] ; GFX9_W64-NEXT: s_nop 0 ; GFX9_W64-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen +; GFX9_W64-NEXT: s_xor_b64 s[6:7], 
exec, s[6:7] ; GFX9_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9_W64-NEXT: ; implicit-def: $vgpr8 -; GFX9_W64-NEXT: s_xor_b64 exec, exec, s[6:7] -; GFX9_W64-NEXT: s_cbranch_execnz .LBB2_1 +; GFX9_W64-NEXT: s_cselect_b64 exec, s[6:7], s[12:13] +; GFX9_W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9_W64-NEXT: ; %bb.2: -; GFX9_W64-NEXT: s_mov_b64 exec, s[12:13] ; GFX9_W64-NEXT: v_and_b32_e32 v0, 0x3ff, v31 ; GFX9_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9_W64-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX9_W64-NEXT: s_cbranch_execz .LBB2_6 +; GFX9_W64-NEXT: s_mov_b64 s[6:7], exec +; GFX9_W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9_W64-NEXT: s_cmov_b64 exec, vcc +; GFX9_W64-NEXT: s_cbranch_scc0 .LBB2_6 ; GFX9_W64-NEXT: ; %bb.3: ; %bb1 ; GFX9_W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX9_W64-NEXT: s_mov_b64 s[12:13], exec @@ -770,14 +771,14 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX9_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX9_W64-NEXT: s_nop 0 ; GFX9_W64-NEXT: buffer_load_format_x v9, v0, s[8:11], 0 idxen +; GFX9_W64-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX9_W64-NEXT: ; implicit-def: $vgpr0 -; GFX9_W64-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX9_W64-NEXT: s_cbranch_execnz .LBB2_4 +; GFX9_W64-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX9_W64-NEXT: s_cbranch_scc1 .LBB2_4 ; GFX9_W64-NEXT: ; %bb.5: -; GFX9_W64-NEXT: s_mov_b64 exec, s[12:13] -; GFX9_W64-NEXT: .LBB2_6: ; %bb2 ; GFX9_W64-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9_W64-NEXT: .LBB2_6: ; %bb2 ; GFX9_W64-NEXT: s_waitcnt vmcnt(0) ; GFX9_W64-NEXT: global_store_dword v[11:12], v9, off ; GFX9_W64-NEXT: s_waitcnt vmcnt(0) @@ -801,17 +802,19 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX1010_W32-NEXT: s_and_b32 s5, vcc_lo, s5 ; GFX1010_W32-NEXT: s_and_saveexec_b32 s5, s5 ; GFX1010_W32-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen +; GFX1010_W32-NEXT: s_xor_b32 s5, 
exec_lo, s5 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr8 ; GFX1010_W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s5 -; GFX1010_W32-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1010_W32-NEXT: s_cselect_b32 exec_lo, s5, s6 +; GFX1010_W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1010_W32-NEXT: ; %bb.2: -; GFX1010_W32-NEXT: s_mov_b32 exec_lo, s6 ; GFX1010_W32-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GFX1010_W32-NEXT: s_mov_b32 s5, exec_lo ; GFX1010_W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1010_W32-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1010_W32-NEXT: s_cbranch_execz .LBB2_6 +; GFX1010_W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1010_W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1010_W32-NEXT: s_cbranch_scc0 .LBB2_6 ; GFX1010_W32-NEXT: ; %bb.3: ; %bb1 ; GFX1010_W32-NEXT: v_mov_b32_e32 v0, s4 ; GFX1010_W32-NEXT: s_mov_b32 s6, exec_lo @@ -825,15 +828,15 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX1010_W32-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX1010_W32-NEXT: s_and_saveexec_b32 s4, s4 ; GFX1010_W32-NEXT: buffer_load_format_x v9, v0, s[8:11], 0 idxen +; GFX1010_W32-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr0 ; GFX1010_W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX1010_W32-NEXT: s_cbranch_execnz .LBB2_4 +; GFX1010_W32-NEXT: s_cselect_b32 exec_lo, s4, s6 +; GFX1010_W32-NEXT: s_cbranch_scc1 .LBB2_4 ; GFX1010_W32-NEXT: ; %bb.5: -; GFX1010_W32-NEXT: s_mov_b32 exec_lo, s6 -; GFX1010_W32-NEXT: .LBB2_6: ; %bb2 ; GFX1010_W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1010_W32-NEXT: .LBB2_6: ; %bb2 ; GFX1010_W32-NEXT: s_waitcnt vmcnt(0) ; GFX1010_W32-NEXT: global_store_dword v[11:12], v9, off ; GFX1010_W32-NEXT: s_waitcnt_vscnt null, 0x0 @@ -857,17 +860,19 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 
%c, ptr ad ; GFX1010_W64-NEXT: s_and_b64 s[6:7], vcc, s[6:7] ; GFX1010_W64-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] ; GFX1010_W64-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen +; GFX1010_W64-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; GFX1010_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1010_W64-NEXT: ; implicit-def: $vgpr8 ; GFX1010_W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010_W64-NEXT: s_xor_b64 exec, exec, s[6:7] -; GFX1010_W64-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1010_W64-NEXT: s_cselect_b64 exec, s[6:7], s[12:13] +; GFX1010_W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1010_W64-NEXT: ; %bb.2: -; GFX1010_W64-NEXT: s_mov_b64 exec, s[12:13] ; GFX1010_W64-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GFX1010_W64-NEXT: s_mov_b64 s[6:7], exec ; GFX1010_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1010_W64-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX1010_W64-NEXT: s_cbranch_execz .LBB2_6 +; GFX1010_W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1010_W64-NEXT: s_cmov_b64 exec, vcc +; GFX1010_W64-NEXT: s_cbranch_scc0 .LBB2_6 ; GFX1010_W64-NEXT: ; %bb.3: ; %bb1 ; GFX1010_W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX1010_W64-NEXT: s_mov_b64 s[12:13], exec @@ -881,15 +886,15 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX1010_W64-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX1010_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX1010_W64-NEXT: buffer_load_format_x v9, v0, s[8:11], 0 idxen +; GFX1010_W64-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1010_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1010_W64-NEXT: ; implicit-def: $vgpr0 ; GFX1010_W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010_W64-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX1010_W64-NEXT: s_cbranch_execnz .LBB2_4 +; GFX1010_W64-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX1010_W64-NEXT: s_cbranch_scc1 .LBB2_4 ; GFX1010_W64-NEXT: ; %bb.5: -; GFX1010_W64-NEXT: s_mov_b64 exec, s[12:13] -; GFX1010_W64-NEXT: .LBB2_6: ; %bb2 ; GFX1010_W64-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1010_W64-NEXT: 
.LBB2_6: ; %bb2 ; GFX1010_W64-NEXT: s_waitcnt vmcnt(0) ; GFX1010_W64-NEXT: global_store_dword v[11:12], v9, off ; GFX1010_W64-NEXT: s_waitcnt_vscnt null, 0x0 @@ -915,17 +920,20 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1100_W32-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen +; GFX1100_W32-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr8 -; GFX1100_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1100_W32-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1100_W32-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX1100_W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1100_W32-NEXT: ; %bb.2: -; GFX1100_W32-NEXT: s_mov_b32 exec_lo, s1 ; GFX1100_W32-NEXT: v_and_b32_e32 v0, 0x3ff, v31 ; GFX1100_W32-NEXT: s_mov_b32 s1, exec_lo ; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100_W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1100_W32-NEXT: s_cbranch_execz .LBB2_6 +; GFX1100_W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1100_W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1100_W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1100_W32-NEXT: s_cbranch_scc0 .LBB2_6 ; GFX1100_W32-NEXT: ; %bb.3: ; %bb1 ; GFX1100_W32-NEXT: v_mov_b32_e32 v0, s4 ; GFX1100_W32-NEXT: s_mov_b32 s2, exec_lo @@ -941,15 +949,15 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1100_W32-NEXT: buffer_load_format_x v9, v0, s[4:7], 0 idxen +; GFX1100_W32-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr0 -; GFX1100_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1100_W32-NEXT: s_cbranch_execnz .LBB2_4 -; GFX1100_W32-NEXT: ; %bb.5: -; GFX1100_W32-NEXT: s_mov_b32 
exec_lo, s2 -; GFX1100_W32-NEXT: .LBB2_6: ; %bb2 ; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1100_W32-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX1100_W32-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1100_W32-NEXT: ; %bb.5: ; GFX1100_W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1100_W32-NEXT: .LBB2_6: ; %bb2 ; GFX1100_W32-NEXT: s_waitcnt vmcnt(0) ; GFX1100_W32-NEXT: global_store_b32 v[11:12], v9, off dlc ; GFX1100_W32-NEXT: s_waitcnt_vscnt null, 0x0 @@ -975,17 +983,20 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX1100_W64-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen +; GFX1100_W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1100_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1100_W64-NEXT: ; implicit-def: $vgpr8 -; GFX1100_W64-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1100_W64-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1100_W64-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX1100_W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1100_W64-NEXT: ; %bb.2: -; GFX1100_W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX1100_W64-NEXT: v_and_b32_e32 v0, 0x3ff, v31 ; GFX1100_W64-NEXT: s_mov_b64 s[2:3], exec ; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100_W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1100_W64-NEXT: s_cbranch_execz .LBB2_6 +; GFX1100_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1100_W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1100_W64-NEXT: s_cmov_b64 exec, vcc +; GFX1100_W64-NEXT: s_cbranch_scc0 .LBB2_6 ; GFX1100_W64-NEXT: ; %bb.3: ; %bb1 ; GFX1100_W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX1100_W64-NEXT: s_mov_b64 s[8:9], exec @@ -1001,15 +1012,15 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX1100_W64-NEXT: buffer_load_format_x v9, v0, 
s[4:7], 0 idxen +; GFX1100_W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1100_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1100_W64-NEXT: ; implicit-def: $vgpr0 -; GFX1100_W64-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1100_W64-NEXT: s_cbranch_execnz .LBB2_4 -; GFX1100_W64-NEXT: ; %bb.5: -; GFX1100_W64-NEXT: s_mov_b64 exec, s[8:9] -; GFX1100_W64-NEXT: .LBB2_6: ; %bb2 ; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1100_W64-NEXT: s_cselect_b64 exec, s[0:1], s[8:9] +; GFX1100_W64-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1100_W64-NEXT: ; %bb.5: ; GFX1100_W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1100_W64-NEXT: .LBB2_6: ; %bb2 ; GFX1100_W64-NEXT: s_waitcnt vmcnt(0) ; GFX1100_W64-NEXT: global_store_b32 v[11:12], v9, off dlc ; GFX1100_W64-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1122,51 +1133,51 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; %bb.2: ; in Loop: Header=BB2_1 Depth=1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: v_readlane_b32 s6, v1, 2 +; W64-O0-NEXT: v_readlane_b32 s7, v1, 3 ; W64-O0-NEXT: v_readlane_b32 s4, v1, 8 ; W64-O0-NEXT: v_readlane_b32 s5, v1, 9 ; W64-O0-NEXT: v_readlane_b32 s8, v1, 4 ; W64-O0-NEXT: v_readlane_b32 s9, v1, 5 ; W64-O0-NEXT: v_readlane_b32 s10, v1, 6 ; W64-O0-NEXT: v_readlane_b32 s11, v1, 7 -; W64-O0-NEXT: v_readlane_b32 s6, v1, 1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: s_nop 2 -; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen +; W64-O0-NEXT: v_readlane_b32 s12, v1, 1 +; W64-O0-NEXT: s_nop 4 +; 
W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s12 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] -; W64-O0-NEXT: s_cbranch_execnz .LBB2_1 +; W64-O0-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; W64-O0-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; W64-O0-NEXT: s_cbranch_scc1 .LBB2_1 ; W64-O0-NEXT: ; %bb.3: ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s6, v0, 2 -; W64-O0-NEXT: v_readlane_b32 s7, v0, 3 -; W64-O0-NEXT: s_mov_b64 exec, s[6:7] -; W64-O0-NEXT: v_readlane_b32 s4, v0, 1 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; W64-O0-NEXT: s_waitcnt vmcnt(2) +; W64-O0-NEXT: v_readlane_b32 s4, v0, 1 ; W64-O0-NEXT: s_mov_b32 s5, 0x3ff ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: v_and_b32_e64 v2, v2, s5 -; W64-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v2, s4 +; W64-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v2, s4 ; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v0, s4, 10 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 11 +; W64-O0-NEXT: s_mov_b64 s[6:7], exec +; W64-O0-NEXT: v_writelane_b32 v0, s6, 10 +; W64-O0-NEXT: v_writelane_b32 v0, s7, 11 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: s_cbranch_execz .LBB2_8 -; W64-O0-NEXT: ; %bb.4: ; %bb1 +; W64-O0-NEXT: s_cmp_lg_u64 s[4:5], 0 +; W64-O0-NEXT: s_cmov_b64 exec, s[4:5] +; W64-O0-NEXT: 
s_cbranch_scc1 .LBB2_4 +; W64-O0-NEXT: s_branch .LBB2_8 +; W64-O0-NEXT: .LBB2_4: ; %bb1 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] @@ -1218,44 +1229,41 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; %bb.6: ; in Loop: Header=BB2_5 Depth=1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: v_readlane_b32 s6, v1, 13 +; W64-O0-NEXT: v_readlane_b32 s7, v1, 14 ; W64-O0-NEXT: v_readlane_b32 s4, v1, 19 ; W64-O0-NEXT: v_readlane_b32 s5, v1, 20 ; W64-O0-NEXT: v_readlane_b32 s8, v1, 15 ; W64-O0-NEXT: v_readlane_b32 s9, v1, 16 ; W64-O0-NEXT: v_readlane_b32 s10, v1, 17 ; W64-O0-NEXT: v_readlane_b32 s11, v1, 18 -; W64-O0-NEXT: v_readlane_b32 s6, v1, 12 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: s_nop 2 -; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen +; W64-O0-NEXT: v_readlane_b32 s12, v1, 12 +; W64-O0-NEXT: s_nop 4 +; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s12 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] -; W64-O0-NEXT: s_cbranch_execnz .LBB2_5 +; W64-O0-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; W64-O0-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; W64-O0-NEXT: s_cbranch_scc1 .LBB2_5 ; W64-O0-NEXT: ; %bb.7: +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; W64-O0-NEXT: 
s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 13 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 14 -; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: v_readlane_b32 s4, v1, 10 +; W64-O0-NEXT: v_readlane_b32 s5, v1, 11 ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; W64-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; W64-O0-NEXT: .LBB2_8: ; %bb2 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v0, 10 -; W64-O0-NEXT: v_readlane_b32 s5, v0, 11 -; W64-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll index 2591ff4bd2538a..5295204a23f7ec 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll @@ -24,12 +24,12 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 { ; GFX9_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX9_W64-NEXT: s_nop 0 ; GFX9_W64-NEXT: buffer_load_format_x v5, v4, s[8:11], 0 idxen +; GFX9_W64-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9_W64-NEXT: ; implicit-def: $vgpr4 -; GFX9_W64-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX9_W64-NEXT: s_cbranch_execnz .LBB0_1 +; 
GFX9_W64-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX9_W64-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX9_W64-NEXT: ; %bb.2: -; GFX9_W64-NEXT: s_mov_b64 exec, s[6:7] ; GFX9_W64-NEXT: s_waitcnt vmcnt(0) ; GFX9_W64-NEXT: v_mov_b32_e32 v0, v5 ; GFX9_W64-NEXT: s_setpc_b64 s[30:31] @@ -48,13 +48,13 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 { ; GFX1010_W32-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX1010_W32-NEXT: s_and_saveexec_b32 s4, s4 ; GFX1010_W32-NEXT: buffer_load_format_x v5, v4, s[8:11], 0 idxen +; GFX1010_W32-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr4 ; GFX1010_W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX1010_W32-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1010_W32-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX1010_W32-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX1010_W32-NEXT: ; %bb.2: -; GFX1010_W32-NEXT: s_mov_b32 exec_lo, s5 ; GFX1010_W32-NEXT: s_waitcnt vmcnt(0) ; GFX1010_W32-NEXT: v_mov_b32_e32 v0, v5 ; GFX1010_W32-NEXT: s_setpc_b64 s[30:31] @@ -73,13 +73,13 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 { ; GFX1010_W64-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX1010_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX1010_W64-NEXT: buffer_load_format_x v5, v4, s[8:11], 0 idxen +; GFX1010_W64-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1010_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1010_W64-NEXT: ; implicit-def: $vgpr4 ; GFX1010_W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010_W64-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX1010_W64-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1010_W64-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX1010_W64-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX1010_W64-NEXT: ; %bb.2: -; GFX1010_W64-NEXT: s_mov_b64 exec, s[6:7] ; GFX1010_W64-NEXT: s_waitcnt vmcnt(0) ; GFX1010_W64-NEXT: v_mov_b32_e32 v0, v5 ; GFX1010_W64-NEXT: s_setpc_b64 s[30:31] @@ -100,12 +100,13 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 
%c) #0 { ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1100_W32-NEXT: buffer_load_format_x v5, v4, s[4:7], 0 idxen +; GFX1100_W32-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr4 -; GFX1100_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1100_W32-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1100_W32-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX1100_W32-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX1100_W32-NEXT: ; %bb.2: -; GFX1100_W32-NEXT: s_mov_b32 exec_lo, s1 ; GFX1100_W32-NEXT: s_waitcnt vmcnt(0) ; GFX1100_W32-NEXT: v_mov_b32_e32 v0, v5 ; GFX1100_W32-NEXT: s_setpc_b64 s[30:31] @@ -126,12 +127,13 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 { ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX1100_W64-NEXT: buffer_load_format_x v5, v4, s[4:7], 0 idxen +; GFX1100_W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1100_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1100_W64-NEXT: ; implicit-def: $vgpr4 -; GFX1100_W64-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1100_W64-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1100_W64-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX1100_W64-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX1100_W64-NEXT: ; %bb.2: -; GFX1100_W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX1100_W64-NEXT: s_waitcnt vmcnt(0) ; GFX1100_W64-NEXT: v_mov_b32_e32 v0, v5 ; GFX1100_W64-NEXT: s_setpc_b64 s[30:31] @@ -224,33 +226,31 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 { ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; %bb.2: ; in Loop: Header=BB0_1 Depth=1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; 
W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: v_readlane_b32 s6, v1, 1 +; W64-O0-NEXT: v_readlane_b32 s7, v1, 2 ; W64-O0-NEXT: v_readlane_b32 s4, v1, 7 ; W64-O0-NEXT: v_readlane_b32 s5, v1, 8 ; W64-O0-NEXT: v_readlane_b32 s8, v1, 3 ; W64-O0-NEXT: v_readlane_b32 s9, v1, 4 ; W64-O0-NEXT: v_readlane_b32 s10, v1, 5 ; W64-O0-NEXT: v_readlane_b32 s11, v1, 6 -; W64-O0-NEXT: v_readlane_b32 s6, v1, 0 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: s_nop 2 -; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen +; W64-O0-NEXT: v_readlane_b32 s12, v1, 0 +; W64-O0-NEXT: s_nop 4 +; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s12 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] -; W64-O0-NEXT: s_cbranch_execnz .LBB0_1 +; W64-O0-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; W64-O0-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; W64-O0-NEXT: s_cbranch_scc1 .LBB0_1 ; W64-O0-NEXT: ; %bb.3: ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 1 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 2 -; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; W64-O0-NEXT: ; kill: killed $vgpr1 ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 @@ -285,11 +285,11 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; GFX9_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX9_W64-NEXT: s_nop 0 ; GFX9_W64-NEXT: buffer_load_format_x v13, v8, s[8:11], 0 idxen +; GFX9_W64-NEXT: s_xor_b64 s[4:5], exec, 
s[4:5] ; GFX9_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX9_W64-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX9_W64-NEXT: s_cbranch_execnz .LBB1_1 +; GFX9_W64-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX9_W64-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX9_W64-NEXT: ; %bb.2: -; GFX9_W64-NEXT: s_mov_b64 exec, s[6:7] ; GFX9_W64-NEXT: s_mov_b64 s[6:7], exec ; GFX9_W64-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1 ; GFX9_W64-NEXT: v_readfirstlane_b32 s8, v4 @@ -302,12 +302,12 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; GFX9_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX9_W64-NEXT: s_nop 0 ; GFX9_W64-NEXT: buffer_load_format_x v0, v8, s[8:11], 0 idxen +; GFX9_W64-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX9_W64-NEXT: ; implicit-def: $vgpr8 -; GFX9_W64-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX9_W64-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9_W64-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX9_W64-NEXT: s_cbranch_scc1 .LBB1_3 ; GFX9_W64-NEXT: ; %bb.4: -; GFX9_W64-NEXT: s_mov_b64 exec, s[6:7] ; GFX9_W64-NEXT: s_waitcnt vmcnt(1) ; GFX9_W64-NEXT: global_store_dword v[9:10], v13, off ; GFX9_W64-NEXT: s_waitcnt vmcnt(0) @@ -329,12 +329,12 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; GFX1010_W32-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX1010_W32-NEXT: s_and_saveexec_b32 s4, s4 ; GFX1010_W32-NEXT: buffer_load_format_x v13, v8, s[8:11], 0 idxen +; GFX1010_W32-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1010_W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX1010_W32-NEXT: s_cbranch_execnz .LBB1_1 +; GFX1010_W32-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX1010_W32-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1010_W32-NEXT: ; %bb.2: -; GFX1010_W32-NEXT: s_mov_b32 exec_lo, s5 ; GFX1010_W32-NEXT: s_mov_b32 s5, exec_lo ; GFX1010_W32-NEXT: .LBB1_3: ; =>This Inner Loop 
Header: Depth=1 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s8, v4 @@ -346,13 +346,13 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; GFX1010_W32-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX1010_W32-NEXT: s_and_saveexec_b32 s4, s4 ; GFX1010_W32-NEXT: buffer_load_format_x v0, v8, s[8:11], 0 idxen +; GFX1010_W32-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr8 ; GFX1010_W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX1010_W32-NEXT: s_cbranch_execnz .LBB1_3 +; GFX1010_W32-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX1010_W32-NEXT: s_cbranch_scc1 .LBB1_3 ; GFX1010_W32-NEXT: ; %bb.4: -; GFX1010_W32-NEXT: s_mov_b32 exec_lo, s5 ; GFX1010_W32-NEXT: s_waitcnt vmcnt(1) ; GFX1010_W32-NEXT: global_store_dword v[9:10], v13, off ; GFX1010_W32-NEXT: s_waitcnt_vscnt null, 0x0 @@ -375,12 +375,12 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; GFX1010_W64-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX1010_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX1010_W64-NEXT: buffer_load_format_x v13, v8, s[8:11], 0 idxen +; GFX1010_W64-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1010_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1010_W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010_W64-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX1010_W64-NEXT: s_cbranch_execnz .LBB1_1 +; GFX1010_W64-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX1010_W64-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1010_W64-NEXT: ; %bb.2: -; GFX1010_W64-NEXT: s_mov_b64 exec, s[6:7] ; GFX1010_W64-NEXT: s_mov_b64 s[6:7], exec ; GFX1010_W64-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s8, v4 @@ -392,13 +392,13 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; GFX1010_W64-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX1010_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; 
GFX1010_W64-NEXT: buffer_load_format_x v0, v8, s[8:11], 0 idxen +; GFX1010_W64-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1010_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1010_W64-NEXT: ; implicit-def: $vgpr8 ; GFX1010_W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010_W64-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX1010_W64-NEXT: s_cbranch_execnz .LBB1_3 +; GFX1010_W64-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX1010_W64-NEXT: s_cbranch_scc1 .LBB1_3 ; GFX1010_W64-NEXT: ; %bb.4: -; GFX1010_W64-NEXT: s_mov_b64 exec, s[6:7] ; GFX1010_W64-NEXT: s_waitcnt vmcnt(1) ; GFX1010_W64-NEXT: global_store_dword v[9:10], v13, off ; GFX1010_W64-NEXT: s_waitcnt_vscnt null, 0x0 @@ -423,12 +423,12 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1100_W32-NEXT: buffer_load_format_x v13, v8, s[4:7], 0 idxen +; GFX1100_W32-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX1100_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1100_W32-NEXT: s_cbranch_execnz .LBB1_1 -; GFX1100_W32-NEXT: ; %bb.2: -; GFX1100_W32-NEXT: s_mov_b32 exec_lo, s1 ; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1100_W32-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX1100_W32-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1100_W32-NEXT: ; %bb.2: ; GFX1100_W32-NEXT: s_mov_b32 s1, exec_lo ; GFX1100_W32-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s4, v4 @@ -442,12 +442,13 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1100_W32-NEXT: buffer_load_format_x v0, v8, s[4:7], 0 idxen +; GFX1100_W32-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr8 -; GFX1100_W32-NEXT: s_xor_b32 exec_lo, 
exec_lo, s0 -; GFX1100_W32-NEXT: s_cbranch_execnz .LBB1_3 +; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1100_W32-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX1100_W32-NEXT: s_cbranch_scc1 .LBB1_3 ; GFX1100_W32-NEXT: ; %bb.4: -; GFX1100_W32-NEXT: s_mov_b32 exec_lo, s1 ; GFX1100_W32-NEXT: s_waitcnt vmcnt(1) ; GFX1100_W32-NEXT: global_store_b32 v[9:10], v13, off dlc ; GFX1100_W32-NEXT: s_waitcnt_vscnt null, 0x0 @@ -472,12 +473,12 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX1100_W64-NEXT: buffer_load_format_x v13, v8, s[4:7], 0 idxen +; GFX1100_W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1100_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX1100_W64-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1100_W64-NEXT: s_cbranch_execnz .LBB1_1 -; GFX1100_W64-NEXT: ; %bb.2: -; GFX1100_W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1100_W64-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX1100_W64-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1100_W64-NEXT: ; %bb.2: ; GFX1100_W64-NEXT: s_mov_b64 s[2:3], exec ; GFX1100_W64-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s4, v4 @@ -491,12 +492,13 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX1100_W64-NEXT: buffer_load_format_x v0, v8, s[4:7], 0 idxen +; GFX1100_W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1100_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1100_W64-NEXT: ; implicit-def: $vgpr8 -; GFX1100_W64-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1100_W64-NEXT: s_cbranch_execnz .LBB1_3 +; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1100_W64-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX1100_W64-NEXT: s_cbranch_scc1 .LBB1_3 ; 
GFX1100_W64-NEXT: ; %bb.4: -; GFX1100_W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX1100_W64-NEXT: s_waitcnt vmcnt(1) ; GFX1100_W64-NEXT: global_store_b32 v[9:10], v13, off dlc ; GFX1100_W64-NEXT: s_waitcnt_vscnt null, 0x0 @@ -641,34 +643,33 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; %bb.2: ; in Loop: Header=BB1_1 Depth=1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: v_readlane_b32 s6, v1, 1 +; W64-O0-NEXT: v_readlane_b32 s7, v1, 2 ; W64-O0-NEXT: v_readlane_b32 s4, v1, 7 ; W64-O0-NEXT: v_readlane_b32 s5, v1, 8 ; W64-O0-NEXT: v_readlane_b32 s8, v1, 3 ; W64-O0-NEXT: v_readlane_b32 s9, v1, 4 ; W64-O0-NEXT: v_readlane_b32 s10, v1, 5 ; W64-O0-NEXT: v_readlane_b32 s11, v1, 6 -; W64-O0-NEXT: v_readlane_b32 s6, v1, 0 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: s_nop 2 -; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen +; W64-O0-NEXT: v_readlane_b32 s12, v1, 0 +; W64-O0-NEXT: s_nop 4 +; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s12 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] -; W64-O0-NEXT: s_cbranch_execnz .LBB1_1 +; W64-O0-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; W64-O0-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; W64-O0-NEXT: s_cbranch_scc1 .LBB1_1 ; W64-O0-NEXT: ; %bb.3: ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 
exec, s[16:17] -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v0, 1 -; W64-O0-NEXT: v_readlane_b32 s5, v0, 2 -; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_mov_b64 s[4:5], exec +; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: v_writelane_b32 v0, s4, 9 ; W64-O0-NEXT: v_writelane_b32 v0, s5, 10 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 @@ -709,33 +710,31 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; %bb.5: ; in Loop: Header=BB1_4 Depth=1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: v_readlane_b32 s6, v1, 9 +; W64-O0-NEXT: v_readlane_b32 s7, v1, 10 ; W64-O0-NEXT: v_readlane_b32 s4, v1, 15 ; W64-O0-NEXT: v_readlane_b32 s5, v1, 16 ; W64-O0-NEXT: v_readlane_b32 s8, v1, 11 ; W64-O0-NEXT: v_readlane_b32 s9, v1, 12 ; W64-O0-NEXT: v_readlane_b32 s10, v1, 13 ; W64-O0-NEXT: v_readlane_b32 s11, v1, 14 -; W64-O0-NEXT: v_readlane_b32 s6, v1, 0 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: s_nop 2 -; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen +; W64-O0-NEXT: v_readlane_b32 s12, v1, 0 +; W64-O0-NEXT: s_nop 4 +; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s12 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] -; W64-O0-NEXT: s_cbranch_execnz .LBB1_4 +; W64-O0-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; W64-O0-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; W64-O0-NEXT: s_cbranch_scc1 .LBB1_4 ; 
W64-O0-NEXT: ; %bb.6: ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v0, 9 -; W64-O0-NEXT: v_readlane_b32 s5, v0, 10 -; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload @@ -785,16 +784,18 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX9_W64-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] ; GFX9_W64-NEXT: s_nop 0 ; GFX9_W64-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen +; GFX9_W64-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; GFX9_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9_W64-NEXT: ; implicit-def: $vgpr8 -; GFX9_W64-NEXT: s_xor_b64 exec, exec, s[6:7] -; GFX9_W64-NEXT: s_cbranch_execnz .LBB2_1 +; GFX9_W64-NEXT: s_cselect_b64 exec, s[6:7], s[12:13] +; GFX9_W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9_W64-NEXT: ; %bb.2: -; GFX9_W64-NEXT: s_mov_b64 exec, s[12:13] ; GFX9_W64-NEXT: v_and_b32_e32 v0, 0x3ff, v31 ; GFX9_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9_W64-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX9_W64-NEXT: s_cbranch_execz .LBB2_6 +; GFX9_W64-NEXT: s_mov_b64 s[6:7], exec +; GFX9_W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9_W64-NEXT: s_cmov_b64 exec, vcc +; GFX9_W64-NEXT: s_cbranch_scc0 .LBB2_6 ; GFX9_W64-NEXT: ; %bb.3: ; %bb1 ; GFX9_W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX9_W64-NEXT: s_mov_b64 s[12:13], exec @@ -809,14 +810,14 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX9_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX9_W64-NEXT: s_nop 0 ; GFX9_W64-NEXT: buffer_load_format_x v9, v0, s[8:11], 0 idxen +; GFX9_W64-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; 
GFX9_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX9_W64-NEXT: ; implicit-def: $vgpr0 -; GFX9_W64-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX9_W64-NEXT: s_cbranch_execnz .LBB2_4 +; GFX9_W64-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX9_W64-NEXT: s_cbranch_scc1 .LBB2_4 ; GFX9_W64-NEXT: ; %bb.5: -; GFX9_W64-NEXT: s_mov_b64 exec, s[12:13] -; GFX9_W64-NEXT: .LBB2_6: ; %bb2 ; GFX9_W64-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9_W64-NEXT: .LBB2_6: ; %bb2 ; GFX9_W64-NEXT: s_waitcnt vmcnt(0) ; GFX9_W64-NEXT: global_store_dword v[11:12], v9, off ; GFX9_W64-NEXT: s_waitcnt vmcnt(0) @@ -840,17 +841,19 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX1010_W32-NEXT: s_and_b32 s5, vcc_lo, s5 ; GFX1010_W32-NEXT: s_and_saveexec_b32 s5, s5 ; GFX1010_W32-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen +; GFX1010_W32-NEXT: s_xor_b32 s5, exec_lo, s5 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr8 ; GFX1010_W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s5 -; GFX1010_W32-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1010_W32-NEXT: s_cselect_b32 exec_lo, s5, s6 +; GFX1010_W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1010_W32-NEXT: ; %bb.2: -; GFX1010_W32-NEXT: s_mov_b32 exec_lo, s6 ; GFX1010_W32-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GFX1010_W32-NEXT: s_mov_b32 s5, exec_lo ; GFX1010_W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1010_W32-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1010_W32-NEXT: s_cbranch_execz .LBB2_6 +; GFX1010_W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1010_W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1010_W32-NEXT: s_cbranch_scc0 .LBB2_6 ; GFX1010_W32-NEXT: ; %bb.3: ; %bb1 ; GFX1010_W32-NEXT: v_mov_b32_e32 v0, s4 ; GFX1010_W32-NEXT: s_mov_b32 s6, exec_lo @@ -864,15 +867,15 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX1010_W32-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX1010_W32-NEXT: s_and_saveexec_b32 s4, s4 ; 
GFX1010_W32-NEXT: buffer_load_format_x v9, v0, s[8:11], 0 idxen +; GFX1010_W32-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr0 ; GFX1010_W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX1010_W32-NEXT: s_cbranch_execnz .LBB2_4 +; GFX1010_W32-NEXT: s_cselect_b32 exec_lo, s4, s6 +; GFX1010_W32-NEXT: s_cbranch_scc1 .LBB2_4 ; GFX1010_W32-NEXT: ; %bb.5: -; GFX1010_W32-NEXT: s_mov_b32 exec_lo, s6 -; GFX1010_W32-NEXT: .LBB2_6: ; %bb2 ; GFX1010_W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1010_W32-NEXT: .LBB2_6: ; %bb2 ; GFX1010_W32-NEXT: s_waitcnt vmcnt(0) ; GFX1010_W32-NEXT: global_store_dword v[11:12], v9, off ; GFX1010_W32-NEXT: s_waitcnt_vscnt null, 0x0 @@ -896,17 +899,19 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX1010_W64-NEXT: s_and_b64 s[6:7], vcc, s[6:7] ; GFX1010_W64-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] ; GFX1010_W64-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen +; GFX1010_W64-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; GFX1010_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1010_W64-NEXT: ; implicit-def: $vgpr8 ; GFX1010_W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010_W64-NEXT: s_xor_b64 exec, exec, s[6:7] -; GFX1010_W64-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1010_W64-NEXT: s_cselect_b64 exec, s[6:7], s[12:13] +; GFX1010_W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1010_W64-NEXT: ; %bb.2: -; GFX1010_W64-NEXT: s_mov_b64 exec, s[12:13] ; GFX1010_W64-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GFX1010_W64-NEXT: s_mov_b64 s[6:7], exec ; GFX1010_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1010_W64-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX1010_W64-NEXT: s_cbranch_execz .LBB2_6 +; GFX1010_W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1010_W64-NEXT: s_cmov_b64 exec, vcc +; GFX1010_W64-NEXT: s_cbranch_scc0 .LBB2_6 ; GFX1010_W64-NEXT: ; %bb.3: ; %bb1 ; GFX1010_W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX1010_W64-NEXT: 
s_mov_b64 s[12:13], exec @@ -920,15 +925,15 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX1010_W64-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX1010_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX1010_W64-NEXT: buffer_load_format_x v9, v0, s[8:11], 0 idxen +; GFX1010_W64-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1010_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1010_W64-NEXT: ; implicit-def: $vgpr0 ; GFX1010_W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010_W64-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX1010_W64-NEXT: s_cbranch_execnz .LBB2_4 +; GFX1010_W64-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; GFX1010_W64-NEXT: s_cbranch_scc1 .LBB2_4 ; GFX1010_W64-NEXT: ; %bb.5: -; GFX1010_W64-NEXT: s_mov_b64 exec, s[12:13] -; GFX1010_W64-NEXT: .LBB2_6: ; %bb2 ; GFX1010_W64-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1010_W64-NEXT: .LBB2_6: ; %bb2 ; GFX1010_W64-NEXT: s_waitcnt vmcnt(0) ; GFX1010_W64-NEXT: global_store_dword v[11:12], v9, off ; GFX1010_W64-NEXT: s_waitcnt_vscnt null, 0x0 @@ -954,17 +959,20 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1100_W32-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen +; GFX1100_W32-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr8 -; GFX1100_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1100_W32-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1100_W32-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX1100_W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1100_W32-NEXT: ; %bb.2: -; GFX1100_W32-NEXT: s_mov_b32 exec_lo, s1 ; GFX1100_W32-NEXT: v_and_b32_e32 v0, 0x3ff, v31 ; GFX1100_W32-NEXT: s_mov_b32 s1, exec_lo ; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100_W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1100_W32-NEXT: s_cbranch_execz .LBB2_6 +; 
GFX1100_W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1100_W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1100_W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1100_W32-NEXT: s_cbranch_scc0 .LBB2_6 ; GFX1100_W32-NEXT: ; %bb.3: ; %bb1 ; GFX1100_W32-NEXT: v_mov_b32_e32 v0, s4 ; GFX1100_W32-NEXT: s_mov_b32 s2, exec_lo @@ -980,15 +988,15 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1100_W32-NEXT: buffer_load_format_x v9, v0, s[4:7], 0 idxen +; GFX1100_W32-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr0 -; GFX1100_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1100_W32-NEXT: s_cbranch_execnz .LBB2_4 -; GFX1100_W32-NEXT: ; %bb.5: -; GFX1100_W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX1100_W32-NEXT: .LBB2_6: ; %bb2 ; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1100_W32-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX1100_W32-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1100_W32-NEXT: ; %bb.5: ; GFX1100_W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1100_W32-NEXT: .LBB2_6: ; %bb2 ; GFX1100_W32-NEXT: s_waitcnt vmcnt(0) ; GFX1100_W32-NEXT: global_store_b32 v[11:12], v9, off dlc ; GFX1100_W32-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1014,17 +1022,20 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX1100_W64-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen +; GFX1100_W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1100_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1100_W64-NEXT: ; implicit-def: $vgpr8 -; GFX1100_W64-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1100_W64-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1100_W64-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX1100_W64-NEXT: 
s_cbranch_scc1 .LBB2_1 ; GFX1100_W64-NEXT: ; %bb.2: -; GFX1100_W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX1100_W64-NEXT: v_and_b32_e32 v0, 0x3ff, v31 ; GFX1100_W64-NEXT: s_mov_b64 s[2:3], exec ; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100_W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1100_W64-NEXT: s_cbranch_execz .LBB2_6 +; GFX1100_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1100_W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1100_W64-NEXT: s_cmov_b64 exec, vcc +; GFX1100_W64-NEXT: s_cbranch_scc0 .LBB2_6 ; GFX1100_W64-NEXT: ; %bb.3: ; %bb1 ; GFX1100_W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX1100_W64-NEXT: s_mov_b64 s[8:9], exec @@ -1040,15 +1051,15 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX1100_W64-NEXT: buffer_load_format_x v9, v0, s[4:7], 0 idxen +; GFX1100_W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1100_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1100_W64-NEXT: ; implicit-def: $vgpr0 -; GFX1100_W64-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1100_W64-NEXT: s_cbranch_execnz .LBB2_4 -; GFX1100_W64-NEXT: ; %bb.5: -; GFX1100_W64-NEXT: s_mov_b64 exec, s[8:9] -; GFX1100_W64-NEXT: .LBB2_6: ; %bb2 ; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1100_W64-NEXT: s_cselect_b64 exec, s[0:1], s[8:9] +; GFX1100_W64-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1100_W64-NEXT: ; %bb.5: ; GFX1100_W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1100_W64-NEXT: .LBB2_6: ; %bb2 ; GFX1100_W64-NEXT: s_waitcnt vmcnt(0) ; GFX1100_W64-NEXT: global_store_b32 v[11:12], v9, off dlc ; GFX1100_W64-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1178,60 +1189,60 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; %bb.2: ; in Loop: Header=BB2_1 Depth=1 +; W64-O0-NEXT: buffer_load_dword v0, off, 
s[0:3], s32 offset:4 ; 4-byte Folded Reload ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: v_readlane_b32 s6, v1, 2 +; W64-O0-NEXT: v_readlane_b32 s7, v1, 3 ; W64-O0-NEXT: v_readlane_b32 s4, v1, 8 ; W64-O0-NEXT: v_readlane_b32 s5, v1, 9 ; W64-O0-NEXT: v_readlane_b32 s8, v1, 4 ; W64-O0-NEXT: v_readlane_b32 s9, v1, 5 ; W64-O0-NEXT: v_readlane_b32 s10, v1, 6 ; W64-O0-NEXT: v_readlane_b32 s11, v1, 7 -; W64-O0-NEXT: v_readlane_b32 s6, v1, 1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: s_nop 2 -; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen +; W64-O0-NEXT: v_readlane_b32 s12, v1, 1 +; W64-O0-NEXT: s_nop 4 +; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s12 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] -; W64-O0-NEXT: s_cbranch_execnz .LBB2_1 +; W64-O0-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; W64-O0-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; W64-O0-NEXT: s_cbranch_scc1 .LBB2_1 ; W64-O0-NEXT: ; %bb.3: ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s6, v0, 2 -; W64-O0-NEXT: v_readlane_b32 s7, v0, 3 -; W64-O0-NEXT: s_mov_b64 exec, s[6:7] -; W64-O0-NEXT: v_readlane_b32 s4, v0, 1 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; W64-O0-NEXT: s_waitcnt vmcnt(2) +; W64-O0-NEXT: v_readlane_b32 s4, v0, 1 ; W64-O0-NEXT: s_mov_b32 s5, 0x3ff ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; 
W64-O0-NEXT: v_and_b32_e64 v2, v2, s5 -; W64-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v2, s4 +; W64-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v2, s4 ; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v0, s4, 10 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 11 +; W64-O0-NEXT: s_mov_b64 s[6:7], exec +; W64-O0-NEXT: v_writelane_b32 v0, s6, 10 +; W64-O0-NEXT: v_writelane_b32 v0, s7, 11 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: s_cbranch_execz .LBB2_8 -; W64-O0-NEXT: ; %bb.4: ; %bb1 +; W64-O0-NEXT: s_cmp_lg_u64 s[4:5], 0 +; W64-O0-NEXT: s_cmov_b64 exec, s[4:5] +; W64-O0-NEXT: s_cbranch_scc1 .LBB2_4 +; W64-O0-NEXT: s_branch .LBB2_8 +; W64-O0-NEXT: .LBB2_4: ; %bb1 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v0, 0 ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; W64-O0-NEXT: s_waitcnt vmcnt(4) +; W64-O0-NEXT: v_readlane_b32 s4, v0, 0 ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: v_mov_b32_e32 v7, v5 ; W64-O0-NEXT: v_mov_b32_e32 v1, v4 @@ -1295,44 +1306,41 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; %bb.6: ; in 
Loop: Header=BB2_5 Depth=1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: v_readlane_b32 s6, v1, 13 +; W64-O0-NEXT: v_readlane_b32 s7, v1, 14 ; W64-O0-NEXT: v_readlane_b32 s4, v1, 19 ; W64-O0-NEXT: v_readlane_b32 s5, v1, 20 ; W64-O0-NEXT: v_readlane_b32 s8, v1, 15 ; W64-O0-NEXT: v_readlane_b32 s9, v1, 16 ; W64-O0-NEXT: v_readlane_b32 s10, v1, 17 ; W64-O0-NEXT: v_readlane_b32 s11, v1, 18 -; W64-O0-NEXT: v_readlane_b32 s6, v1, 12 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: s_nop 2 -; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen +; W64-O0-NEXT: v_readlane_b32 s12, v1, 12 +; W64-O0-NEXT: s_nop 4 +; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s12 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] -; W64-O0-NEXT: s_cbranch_execnz .LBB2_5 +; W64-O0-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; W64-O0-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; W64-O0-NEXT: s_cbranch_scc1 .LBB2_5 ; W64-O0-NEXT: ; %bb.7: +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 13 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 14 -; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: v_readlane_b32 s4, v1, 10 +; W64-O0-NEXT: v_readlane_b32 
s5, v1, 11 ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; W64-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; W64-O0-NEXT: .LBB2_8: ; %bb2 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v0, 10 -; W64-O0-NEXT: v_readlane_b32 s5, v0, 11 -; W64-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir index c702de6285d9b1..1c1a1af94d7264 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir @@ -62,15 +62,13 @@ body: | ; W64-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; W64-NEXT: {{ $}} ; W64-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[COPY1]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec - ; W64-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; W64-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; W64-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; W64-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; W64-NEXT: {{ $}} ; W64-NEXT: .3: - ; W64-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; W64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] ; W64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_IDXEN]] ; W64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 - ; ; W32-LABEL: name: idxen ; W32: successors: %bb.1(0x80000000) ; W32-NEXT: liveins: $vgpr0, 
$vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 @@ -107,11 +105,10 @@ body: | ; W32-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; W32-NEXT: {{ $}} ; W32-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[COPY1]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec - ; W32-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; W32-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; W32-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; W32-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_MOV_B32_]], %bb.1, implicit $scc ; W32-NEXT: {{ $}} ; W32-NEXT: .3: - ; W32-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]] ; W32-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] ; W32-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_IDXEN]] ; W32-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 @@ -177,15 +174,13 @@ body: | ; W64-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; W64-NEXT: {{ $}} ; W64-NEXT: [[BUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec - ; W64-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; W64-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; W64-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; W64-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; W64-NEXT: {{ $}} ; W64-NEXT: .3: - ; W64-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; W64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] ; W64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_OFFEN]] ; W64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 - ; ; W32-LABEL: name: offen ; W32: successors: %bb.1(0x80000000) ; W32-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 @@ -222,11 +217,10 @@ body: | ; W32-NEXT: successors: %bb.1(0x40000000), 
%bb.3(0x40000000) ; W32-NEXT: {{ $}} ; W32-NEXT: [[BUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec - ; W32-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; W32-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; W32-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; W32-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_MOV_B32_]], %bb.1, implicit $scc ; W32-NEXT: {{ $}} ; W32-NEXT: .3: - ; W32-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]] ; W32-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] ; W32-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_OFFEN]] ; W32-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 @@ -292,15 +286,13 @@ body: | ; W64-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; W64-NEXT: {{ $}} ; W64-NEXT: [[BUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN [[COPY1]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec - ; W64-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; W64-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; W64-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; W64-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; W64-NEXT: {{ $}} ; W64-NEXT: .3: - ; W64-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; W64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] ; W64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_BOTHEN]] ; W64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 - ; ; W32-LABEL: name: bothen ; W32: successors: %bb.1(0x80000000) ; W32-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 @@ -337,11 +329,10 @@ body: | ; W32-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; W32-NEXT: {{ $}} ; W32-NEXT: [[BUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = 
BUFFER_LOAD_FORMAT_X_BOTHEN [[COPY1]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec - ; W32-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; W32-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; W32-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; W32-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_MOV_B32_]], %bb.1, implicit $scc ; W32-NEXT: {{ $}} ; W32-NEXT: .3: - ; W32-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]] ; W32-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] ; W32-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_BOTHEN]] ; W32-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 @@ -397,7 +388,6 @@ body: | ; ADDR64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] ; ADDR64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_ADDR64_]] ; ADDR64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 - ; ; W32-LABEL: name: addr64 ; W32: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 ; W32-NEXT: {{ $}} @@ -475,7 +465,6 @@ body: | ; ADDR64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] ; ADDR64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_ADDR64_]] ; ADDR64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 - ; ; W64-NO-ADDR64-LABEL: name: offset ; W64-NO-ADDR64: successors: %bb.1(0x80000000) ; W64-NO-ADDR64-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 @@ -512,15 +501,13 @@ body: | ; W64-NO-ADDR64-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; W64-NO-ADDR64-NEXT: {{ $}} ; W64-NO-ADDR64-NEXT: [[BUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec - ; W64-NO-ADDR64-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; W64-NO-ADDR64-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; W64-NO-ADDR64-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; 
W64-NO-ADDR64-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_MOV_B64_]], %bb.1, implicit $scc ; W64-NO-ADDR64-NEXT: {{ $}} ; W64-NO-ADDR64-NEXT: .3: - ; W64-NO-ADDR64-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; W64-NO-ADDR64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] ; W64-NO-ADDR64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_OFFSET]] ; W64-NO-ADDR64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 - ; ; W32-LABEL: name: offset ; W32: successors: %bb.1(0x80000000) ; W32-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 @@ -557,11 +544,10 @@ body: | ; W32-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; W32-NEXT: {{ $}} ; W32-NEXT: [[BUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec - ; W32-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; W32-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; W32-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; W32-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_MOV_B32_]], %bb.1, implicit $scc ; W32-NEXT: {{ $}} ; W32-NEXT: .3: - ; W32-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]] ; W32-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] ; W32-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_OFFSET]] ; W32-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll index 1e9994dd8e6efd..30ccbbd0b7f996 100644 --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -24,10 +24,10 @@ define void @lsr_order_mul24_0(i32 %arg, i32 %arg2, i32 %arg6, i32 %arg13, i32 % ; GFX9-NEXT: v_add_u32_e32 v5, v5, v0 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB0_1 +; GFX9-NEXT: 
s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX9-NEXT: ; %bb.2: ; %.loopexit -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] bb: @@ -55,10 +55,12 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, ptr addrspace(3) ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v5, 1, v18 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 ; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v0, v1 -; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB1_3 +; GFX9-NEXT: s_mov_b64 s[8:9], exec +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 ; GFX9-NEXT: ; %bb.1: ; %bb19 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v6 ; GFX9-NEXT: v_add_u32_e32 v4, v4, v0 @@ -94,14 +96,16 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, ptr addrspace(3) ; GFX9-NEXT: global_load_dword v3, v[18:19], off ; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v1 ; GFX9-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[4:5] ; GFX9-NEXT: ds_write_b32 v6, v3 ; GFX9-NEXT: v_add_u32_e32 v6, v6, v8 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX9-NEXT: s_cbranch_execnz .LBB1_2 -; GFX9-NEXT: .LBB1_3: ; %Flow2 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[10:11] +; GFX9-NEXT: s_cbranch_scc1 .LBB1_2 +; GFX9-NEXT: ; %bb.3: ; %Flow ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-NEXT: .LBB1_4: ; %Flow2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] bb: diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll index 727b607e7ded06..3563658061a059 100644 --- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll +++ 
b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll @@ -24,22 +24,20 @@ define amdgpu_vs void @multi_else_break(<4 x float> %vec, i32 %ub, i32 %cont) { ; OPT-NEXT: [[TMP3]] = phi i32 [ [[TMP47:%.*]], [[ENDIF]] ], [ undef, [[LOOP]] ] ; OPT-NEXT: [[TMP4:%.*]] = phi i1 [ [[TMP51:%.*]], [[ENDIF]] ], [ true, [[LOOP]] ] ; OPT-NEXT: [[TMP5:%.*]] = phi i1 [ [[TMP51_INV:%.*]], [[ENDIF]] ], [ true, [[LOOP]] ] -; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP2]]) ; OPT-NEXT: [[TMP6]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP5]], i64 [[PHI_BROKEN]]) ; OPT-NEXT: [[TMP7:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP6]]) ; OPT-NEXT: [[TMP8]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP4]], i64 [[PHI_BROKEN2]]) ; OPT-NEXT: br i1 [[TMP7]], label [[FLOW1]], label [[LOOP]] ; OPT: Flow1: -; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP6]]) ; OPT-NEXT: [[TMP9:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP8]]) ; OPT-NEXT: br i1 [[TMP9]], label [[IF:%.*]], label [[LOOP_OUTER]] ; OPT: IF: -; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP8]]) ; OPT-NEXT: ret void ; OPT: ENDIF: ; OPT-NEXT: [[TMP47]] = add i32 [[TMP45]], 1 ; OPT-NEXT: [[TMP51]] = icmp eq i32 [[TMP47]], [[CONT:%.*]] ; OPT-NEXT: [[TMP51_INV]] = xor i1 [[TMP51]], true +; OPT-NEXT: call void @llvm.amdgcn.wave.reconverge.i64(i64 [[TMP2]]) ; OPT-NEXT: br label [[FLOW]] ; ; GCN-LABEL: multi_else_break: @@ -158,7 +156,6 @@ define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 { ; OPT-NEXT: [[TMP10]] = phi i1 [ [[CMP1]], [[CASE0]] ], [ [[TMP7]], [[LEAFBLOCK]] ] ; OPT-NEXT: br label [[FLOW4]] ; OPT: bb9: -; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP4]]) ; OPT-NEXT: ret void ; ; GCN-LABEL: multi_if_break_loop: diff --git a/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll b/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll index d62f045674acec..e01d6857cb79ff 100644 --- a/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll +++ b/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll @@ -16,17 +16,19 @@ 
define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) { ; GFX10-NEXT: .p2align 6 ; GFX10-NEXT: .LBB0_1: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_and_b32 s0, exec_lo, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_or_b32 s1, s0, s1 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s1 -; GFX10-NEXT: s_cbranch_execz .LBB0_4 +; GFX10-NEXT: s_andn2_b32 s0, exec_lo, s1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX10-NEXT: s_cbranch_scc0 .LBB0_4 ; GFX10-NEXT: .LBB0_2: ; %bb ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_or_b32 s2, s2, exec_lo -; GFX10-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB0_1 +; GFX10-NEXT: s_mov_b32 s3, exec_lo +; GFX10-NEXT: s_and_b32 s0, vcc_lo, exec_lo +; GFX10-NEXT: s_cmov_b32 exec_lo, s0 +; GFX10-NEXT: s_cbranch_scc0 .LBB0_1 ; GFX10-NEXT: ; %bb.3: ; %branch2_merge ; GFX10-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; GFX10-NEXT: s_mov_b32 s5, s4 @@ -47,6 +49,7 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) { ; GFX10-NEXT: v_cmp_le_f32_e64 s0, 0, v1 ; GFX10-NEXT: s_and_b32 s0, s0, exec_lo ; GFX10-NEXT: s_or_b32 s2, s2, s0 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10-NEXT: s_branch .LBB0_1 ; GFX10-NEXT: .LBB0_4: ; %loop0_merge ; GFX10-NEXT: s_inst_prefetch 0x2 @@ -63,18 +66,21 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) { ; GFX12-NEXT: s_branch .LBB0_2 ; GFX12-NEXT: .LBB0_1: ; %Flow ; GFX12-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX12-NEXT: v_mov_b32_e32 v1, v0 ; GFX12-NEXT: s_and_b32 s0, exec_lo, s2 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v1, v0 ; GFX12-NEXT: s_or_b32 s1, s0, s1 -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execz .LBB0_4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc0 .LBB0_4 ; GFX12-NEXT: .LBB0_2: ; %bb ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_or_b32 s2, s2, exec_lo -; GFX12-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX12-NEXT: s_cbranch_execz .LBB0_1 +; GFX12-NEXT: s_mov_b32 s3, exec_lo +; GFX12-NEXT: s_and_b32 s0, vcc_lo, exec_lo +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cmov_b32 exec_lo, s0 +; GFX12-NEXT: s_cbranch_scc0 .LBB0_1 ; GFX12-NEXT: ; %bb.3: ; %branch2_merge ; GFX12-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; GFX12-NEXT: s_mov_b32 s5, s4 @@ -97,6 +103,7 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) { ; GFX12-NEXT: s_and_b32 s0, s0, exec_lo ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_or_b32 s2, s2, s0 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX12-NEXT: s_branch .LBB0_1 ; GFX12-NEXT: .LBB0_4: ; %loop0_merge ; GFX12-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll index b84686139d0e2c..ff8e9427f8e949 100644 --- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll @@ -219,71 +219,83 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3 ; MUBUF-LABEL: func_non_entry_block_static_alloca_align4: ; MUBUF: ; %bb.0: ; %entry ; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; MUBUF-NEXT: s_mov_b32 s7, s33 -; MUBUF-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; MUBUF-NEXT: s_mov_b32 s9, s33 ; MUBUF-NEXT: s_mov_b32 s33, s32 ; MUBUF-NEXT: s_addk_i32 s32, 0x400 -; MUBUF-NEXT: s_and_saveexec_b64 s[4:5], vcc -; MUBUF-NEXT: s_cbranch_execz .LBB2_3 +; MUBUF-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; MUBUF-NEXT: s_mov_b64 s[4:5], exec +; MUBUF-NEXT: s_cmp_lg_u64 vcc, 0 +; MUBUF-NEXT: s_cmov_b64 exec, vcc +; MUBUF-NEXT: s_cbranch_scc0 .LBB2_4 ; MUBUF-NEXT: ; %bb.1: 
; %bb.0 ; MUBUF-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; MUBUF-NEXT: s_and_b64 exec, exec, vcc -; MUBUF-NEXT: s_cbranch_execz .LBB2_3 +; MUBUF-NEXT: s_mov_b64 s[6:7], exec +; MUBUF-NEXT: s_cmp_lg_u64 vcc, 0 +; MUBUF-NEXT: s_cmov_b64 exec, vcc +; MUBUF-NEXT: s_cbranch_scc0 .LBB2_3 ; MUBUF-NEXT: ; %bb.2: ; %bb.1 -; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000 +; MUBUF-NEXT: s_add_i32 s8, s32, 0x1000 ; MUBUF-NEXT: v_mov_b32_e32 v2, 0 -; MUBUF-NEXT: v_mov_b32_e32 v3, s6 +; MUBUF-NEXT: v_mov_b32_e32 v3, s8 ; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; MUBUF-NEXT: v_mov_b32_e32 v2, 1 ; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:4 -; MUBUF-NEXT: v_lshl_add_u32 v2, v4, 2, s6 +; MUBUF-NEXT: v_lshl_add_u32 v2, v4, 2, s8 ; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen ; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v31 -; MUBUF-NEXT: s_mov_b32 s32, s6 +; MUBUF-NEXT: s_mov_b32 s32, s8 ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: v_add_u32_e32 v2, v2, v3 ; MUBUF-NEXT: global_store_dword v[0:1], v2, off -; MUBUF-NEXT: .LBB2_3: ; %bb.2 +; MUBUF-NEXT: s_or_b64 exec, exec, s[6:7] +; MUBUF-NEXT: .LBB2_3: ; %Flow ; MUBUF-NEXT: s_or_b64 exec, exec, s[4:5] +; MUBUF-NEXT: .LBB2_4: ; %bb.2 ; MUBUF-NEXT: v_mov_b32_e32 v0, 0 ; MUBUF-NEXT: global_store_dword v[0:1], v0, off ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_addk_i32 s32, 0xfc00 -; MUBUF-NEXT: s_mov_b32 s33, s7 +; MUBUF-NEXT: s_mov_b32 s33, s9 ; MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; FLATSCR-LABEL: func_non_entry_block_static_alloca_align4: ; FLATSCR: ; %bb.0: ; %entry ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FLATSCR-NEXT: s_mov_b32 s3, s33 -; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; FLATSCR-NEXT: s_mov_b32 s5, s33 ; FLATSCR-NEXT: s_mov_b32 s33, s32 ; FLATSCR-NEXT: s_add_i32 s32, s32, 16 -; FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc -; FLATSCR-NEXT: s_cbranch_execz .LBB2_3 +; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; FLATSCR-NEXT: s_mov_b64 s[0:1], exec +; FLATSCR-NEXT: 
s_cmp_lg_u64 vcc, 0 +; FLATSCR-NEXT: s_cmov_b64 exec, vcc +; FLATSCR-NEXT: s_cbranch_scc0 .LBB2_4 ; FLATSCR-NEXT: ; %bb.1: ; %bb.0 ; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; FLATSCR-NEXT: s_and_b64 exec, exec, vcc -; FLATSCR-NEXT: s_cbranch_execz .LBB2_3 +; FLATSCR-NEXT: s_mov_b64 s[2:3], exec +; FLATSCR-NEXT: s_cmp_lg_u64 vcc, 0 +; FLATSCR-NEXT: s_cmov_b64 exec, vcc +; FLATSCR-NEXT: s_cbranch_scc0 .LBB2_3 ; FLATSCR-NEXT: ; %bb.2: ; %bb.1 -; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000 +; FLATSCR-NEXT: s_add_i32 s4, s32, 0x1000 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 0 ; FLATSCR-NEXT: v_mov_b32_e32 v3, 1 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[2:3], s2 -; FLATSCR-NEXT: v_lshl_add_u32 v2, v4, 2, s2 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[2:3], s4 +; FLATSCR-NEXT: v_lshl_add_u32 v2, v4, 2, s4 ; FLATSCR-NEXT: scratch_load_dword v2, v2, off ; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v31 -; FLATSCR-NEXT: s_mov_b32 s32, s2 +; FLATSCR-NEXT: s_mov_b32 s32, s4 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3 ; FLATSCR-NEXT: global_store_dword v[0:1], v2, off -; FLATSCR-NEXT: .LBB2_3: ; %bb.2 +; FLATSCR-NEXT: s_or_b64 exec, exec, s[2:3] +; FLATSCR-NEXT: .LBB2_3: ; %Flow ; FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1] +; FLATSCR-NEXT: .LBB2_4: ; %bb.2 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 ; FLATSCR-NEXT: global_store_dword v[0:1], v0, off ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_add_i32 s32, s32, -16 -; FLATSCR-NEXT: s_mov_b32 s33, s3 +; FLATSCR-NEXT: s_mov_b32 s33, s5 ; FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: @@ -318,11 +330,13 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i ; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; MUBUF-NEXT: s_mov_b32 s7, s33 ; MUBUF-NEXT: s_add_i32 s33, s32, 0xfc0 -; MUBUF-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; MUBUF-NEXT: s_and_b32 s33, s33, 0xfffff000 ; MUBUF-NEXT: s_addk_i32 s32, 0x2000 -; MUBUF-NEXT: s_and_saveexec_b64 s[4:5], vcc -; MUBUF-NEXT: s_cbranch_execz 
.LBB3_2 +; MUBUF-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; MUBUF-NEXT: s_mov_b64 s[4:5], exec +; MUBUF-NEXT: s_cmp_lg_u64 vcc, 0 +; MUBUF-NEXT: s_cmov_b64 exec, vcc +; MUBUF-NEXT: s_cbranch_scc0 .LBB3_2 ; MUBUF-NEXT: ; %bb.1: ; %bb.0 ; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000 ; MUBUF-NEXT: s_and_b32 s6, s6, 0xfffff000 @@ -338,8 +352,8 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: v_add_u32_e32 v2, v2, v3 ; MUBUF-NEXT: global_store_dword v[0:1], v2, off -; MUBUF-NEXT: .LBB3_2: ; %bb.1 ; MUBUF-NEXT: s_or_b64 exec, exec, s[4:5] +; MUBUF-NEXT: .LBB3_2: ; %bb.1 ; MUBUF-NEXT: v_mov_b32_e32 v0, 0 ; MUBUF-NEXT: global_store_dword v[0:1], v0, off ; MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -352,11 +366,13 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; FLATSCR-NEXT: s_mov_b32 s3, s33 ; FLATSCR-NEXT: s_add_i32 s33, s32, 63 -; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; FLATSCR-NEXT: s_andn2_b32 s33, s33, 63 ; FLATSCR-NEXT: s_addk_i32 s32, 0x80 -; FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc -; FLATSCR-NEXT: s_cbranch_execz .LBB3_2 +; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; FLATSCR-NEXT: s_mov_b64 s[0:1], exec +; FLATSCR-NEXT: s_cmp_lg_u64 vcc, 0 +; FLATSCR-NEXT: s_cmov_b64 exec, vcc +; FLATSCR-NEXT: s_cbranch_scc0 .LBB3_2 ; FLATSCR-NEXT: ; %bb.1: ; %bb.0 ; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000 ; FLATSCR-NEXT: s_and_b32 s2, s2, 0xfffff000 @@ -370,8 +386,8 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3 ; FLATSCR-NEXT: global_store_dword v[0:1], v2, off -; FLATSCR-NEXT: .LBB3_2: ; %bb.1 ; FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1] +; FLATSCR-NEXT: .LBB3_2: ; %bb.1 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 ; FLATSCR-NEXT: global_store_dword v[0:1], v0, off ; FLATSCR-NEXT: s_waitcnt vmcnt(0) diff --git 
a/llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir b/llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir index 748775dc2cf1d5..2c20af85c6966a 100644 --- a/llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir @@ -115,9 +115,9 @@ body: | ; GCN-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub1_sub2_sub3_sub4_sub5, killed [[REG_SEQUENCE3]], %subreg.sub1_sub2_sub3_sub4_sub5_sub6 ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GCN-NEXT: BUFFER_STORE_DWORD_ADDR64 killed [[V_MOV_B32_e32_1]], [[V_LSHL_B64_e64_]], killed [[REG_SEQUENCE4]], 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2.bb2: - ; GCN-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: S_ENDPGM 0 bb.0.bb: successors: %bb.1.bb1(0x40000000), %bb.2.bb2(0x40000000) @@ -158,9 +158,9 @@ body: | %29 = V_MOV_B32_e32 0, implicit $exec %30 = COPY %24 BUFFER_STORE_DWORD_ADDR64 killed %29, killed %30, killed %28, 0, 0, 0, 0, implicit $exec + SI_WAVE_RECONVERGE %1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2.bb2: - SI_END_CF %1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0 ... 
@@ -257,9 +257,9 @@ body: | ; GCN-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GCN-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_LSHL_B64_]] ; GCN-NEXT: BUFFER_STORE_DWORD_ADDR64 killed [[V_MOV_B32_e32_2]], killed [[COPY3]], killed [[REG_SEQUENCE5]], 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2.bb2: - ; GCN-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: S_ENDPGM 0 bb.0.bb: successors: %bb.1.bb1(0x40000000), %bb.2.bb2(0x40000000) @@ -308,9 +308,9 @@ body: | %38 = V_MOV_B32_e32 0, implicit $exec %39 = COPY %33 BUFFER_STORE_DWORD_ADDR64 killed %38, killed %39, killed %37, 0, 0, 0, 0, implicit $exec + SI_WAVE_RECONVERGE %1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2.bb2: - SI_END_CF %1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0 ... 
@@ -388,9 +388,9 @@ body: | ; GCN-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub1_sub2_sub3_sub4_sub5, killed [[REG_SEQUENCE2]], %subreg.sub1_sub2_sub3_sub4_sub5_sub6 ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GCN-NEXT: BUFFER_STORE_DWORD_ADDR64 killed [[V_MOV_B32_e32_1]], [[V_LSHL_B64_e64_]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2.bb2: - ; GCN-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: S_ENDPGM 0 bb.0.bb: successors: %bb.1.bb1(0x40000000), %bb.2.bb2(0x40000000) @@ -431,9 +431,9 @@ body: | %29 = V_MOV_B32_e32 0, implicit $exec %30 = COPY %24 BUFFER_STORE_DWORD_ADDR64 killed %29, killed %30, killed %28, 0, 0, 0, 0, implicit $exec + SI_WAVE_RECONVERGE %1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2.bb2: - SI_END_CF %1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll index 5b0354e63c2365..f453fc7e82c2e3 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll @@ -223,12 +223,15 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out, ; GCN-NEXT: s_cbranch_scc1 .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %bb0 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v2, off, s[92:95], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v0, off, s[92:95], 0 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GCN-NEXT: buffer_load_dword v1, off, s[92:95], 0 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GCN-NEXT: buffer_load_dword v2, off, s[92:95], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s8, v2, 56 ; GCN-NEXT: v_readlane_b32 s9, v2, 57 ; GCN-NEXT: v_readlane_b32 s10, v2, 58 @@ -293,7 +296,6 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out, ; GCN-NEXT: v_readlane_b32 s73, v2, 5 ; GCN-NEXT: v_readlane_b32 s74, v2, 6 ; GCN-NEXT: v_readlane_b32 s75, v2, 7 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s76, v1, 56 ; GCN-NEXT: v_readlane_b32 s77, v1, 57 ; GCN-NEXT: v_readlane_b32 s78, v1, 58 @@ -318,9 +320,6 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out, ; GCN-NEXT: v_readlane_b32 s5, v1, 5 ; GCN-NEXT: v_readlane_b32 s6, v1, 6 ; GCN-NEXT: v_readlane_b32 s7, v1, 7 -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[92:95], 0 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: 
;;#ASMEND @@ -379,7 +378,6 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out, ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s0, v0, 0 ; GCN-NEXT: v_readlane_b32 s1, v0, 1 ; GCN-NEXT: v_readlane_b32 s2, v0, 2 @@ -597,12 +595,12 @@ define amdgpu_kernel void @split_sgpr_spill_2_vgprs(ptr addrspace(1) %out, i32 % ; GCN-NEXT: s_cbranch_scc1 .LBB1_2 ; GCN-NEXT: ; %bb.1: ; %bb0 ; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[28:29] ; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[28:29] -; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s16, v1, 8 ; GCN-NEXT: v_readlane_b32 s17, v1, 9 ; GCN-NEXT: v_readlane_b32 s20, v1, 0 @@ -613,7 +611,6 @@ define amdgpu_kernel void @split_sgpr_spill_2_vgprs(ptr addrspace(1) %out, i32 % ; GCN-NEXT: v_readlane_b32 s25, v1, 5 ; GCN-NEXT: v_readlane_b32 s26, v1, 6 ; GCN-NEXT: v_readlane_b32 s27, v1, 7 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s36, v0, 32 ; GCN-NEXT: v_readlane_b32 s37, v0, 33 ; GCN-NEXT: v_readlane_b32 s38, v0, 34 @@ -857,6 +854,9 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 % ; GCN-NEXT: s_cbranch_scc1 .LBB2_2 ; GCN-NEXT: ; %bb.1: ; %bb0 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; 
GCN-NEXT: s_waitcnt vmcnt(0) @@ -908,9 +908,6 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 % ; GCN-NEXT: v_readlane_b32 s29, v1, 13 ; GCN-NEXT: v_readlane_b32 s30, v1, 14 ; GCN-NEXT: v_readlane_b32 s31, v1, 15 -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[16:31] ; GCN-NEXT: ;;#ASMEND @@ -933,7 +930,6 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 % ; GCN-NEXT: v_readlane_b32 s17, v1, 61 ; GCN-NEXT: v_readlane_b32 s18, v1, 62 ; GCN-NEXT: v_readlane_b32 s19, v1, 63 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s0, v0, 0 ; GCN-NEXT: v_readlane_b32 s1, v0, 1 ; GCN-NEXT: ;;#ASMSTART @@ -1109,6 +1105,9 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 { ; GCN-NEXT: s_cbranch_scc1 .LBB3_2 ; GCN-NEXT: ; %bb.1: ; %bb0 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GCN-NEXT: buffer_load_dword v2, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -1160,9 +1159,6 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 { ; GCN-NEXT: v_readlane_b32 s29, v2, 13 ; GCN-NEXT: v_readlane_b32 s30, v2, 14 ; GCN-NEXT: v_readlane_b32 s31, v2, 15 -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def v0 ; GCN-NEXT: ;;#ASMEND @@ -1188,7 +1184,6 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 { ; GCN-NEXT: v_readlane_b32 s17, v2, 61 ; GCN-NEXT: v_readlane_b32 s18, v2, 62 ; GCN-NEXT: v_readlane_b32 s19, v2, 63 -; GCN-NEXT: 
s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s0, v1, 0 ; GCN-NEXT: v_readlane_b32 s1, v1, 1 ; GCN-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir b/llvm/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir index 8b009978055ac6..cb629d79ea364d 100644 --- a/llvm/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir +++ b/llvm/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir @@ -35,13 +35,12 @@ body: | bb.2: successors: %bb.3(0x80000000) - %24:sreg_64 = PHI %20, %bb.3, %22, %bb.0 - %23:vgpr_32 = PHI %19, %bb.3, %18, %bb.0 - SI_END_CF %24, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + %23:vgpr_32 = PHI %19, %bb.4, %18, %bb.0 %3:vgpr_32, dead %10:sreg_64 = nsw V_ADD_CO_U32_e64 1, %23, 0, implicit $exec + SI_WAVE_RECONVERGE %22, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: - successors: %bb.3(0x40000000), %bb.2(0x40000000) + successors: %bb.3(0x40000000), %bb.4(0x40000000) %4:vgpr_32 = PHI %19, %bb.3, %3, %bb.2, %18, %bb.0 %15:sreg_32_xm0 = S_MOV_B32 61440 @@ -49,7 +48,10 @@ body: | %17:sgpr_128 = REG_SEQUENCE undef %14:sreg_32_xm0, %subreg.sub0, undef %12:sreg_32_xm0, %subreg.sub1, %16, %subreg.sub2, %15, %subreg.sub3 BUFFER_STORE_DWORD_OFFSET %4, %17, 0, 0, 0, 0, implicit $exec :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1) %19:vgpr_32 = COPY %4 - %20:sreg_64 = SI_IF %0, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + %20:sreg_64 = SI_IF %0, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.3 + bb.4: + SI_WAVE_RECONVERGE %20, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_BRANCH %bb.2 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index ad1f790457de97..26e0b369a821aa 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -2579,8 +2579,9 @@ define amdgpu_kernel void @negativeoffsetnullptr(ptr %buffer) { ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_and_b64 s[2:3], exec, vcc ; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_cbranch_execnz .LBB8_1 +; GFX8-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX8-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX8-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX8-NEXT: ; %bb.2: ; %end ; GFX8-NEXT: s_endpgm ; @@ -2598,8 +2599,9 @@ define amdgpu_kernel void @negativeoffsetnullptr(ptr %buffer) { ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_and_b64 s[2:3], exec, vcc ; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB8_1 +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX9-NEXT: ; %bb.2: ; %end ; GFX9-NEXT: s_endpgm ; @@ -2618,8 +2620,9 @@ define amdgpu_kernel void @negativeoffsetnullptr(ptr %buffer) { ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_and_b32 s1, exec_lo, vcc_lo ; GFX10-NEXT: s_or_b32 s0, s1, s0 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: s_cbranch_execnz .LBB8_1 +; GFX10-NEXT: s_andn2_b32 s1, exec_lo, s0 +; GFX10-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX10-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX10-NEXT: ; %bb.2: ; %end ; GFX10-NEXT: s_endpgm ; @@ -2638,8 +2641,10 @@ define amdgpu_kernel void @negativeoffsetnullptr(ptr %buffer) { ; GFX11-NEXT: s_and_b32 s1, exec_lo, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 
s0, s1, s0 -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX11-NEXT: ; %bb.2: ; %end ; GFX11-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll index fe093d4ac8515e..94835fb88784ba 100644 --- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll @@ -25,7 +25,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, 0, v6, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v7, vcc ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[6:7] -; GFX9-NEXT: v_mov_b32_e32 v21, v20 +; GFX9-NEXT: s_mov_b64 s[8:9], exec ; GFX9-NEXT: v_cndmask_b32_e32 v22, v5, v9, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v23, v4, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v11, vcc @@ -73,6 +73,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_or_b32_e32 v13, v7, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; GFX9-NEXT: v_mov_b32_e32 v21, v20 ; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] ; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc @@ -83,13 +84,14 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13] ; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], vcc ; GFX9-NEXT: v_cndmask_b32_e64 v11, v3, 0, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v10, v1, 0, s[4:5] +; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec ; GFX9-NEXT: v_cndmask_b32_e64 v13, v0, 0, s[4:5] -; GFX9-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB0_6 +; 
GFX9-NEXT: s_cmov_b64 exec, s[6:7] +; GFX9-NEXT: s_cbranch_scc0 .LBB0_6 ; GFX9-NEXT: ; %bb.1: ; %udiv-bb1 ; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 1, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v7, vcc @@ -107,21 +109,22 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_or_b32_e32 v8, v10, v12 ; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v13 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v13 -; GFX9-NEXT: v_lshlrev_b64 v[12:13], v13, v[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-NEXT: v_lshlrev_b64 v[8:9], v13, v[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v13 ; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, v13, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, v9, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, v8, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-NEXT: s_xor_b64 s[6:7], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: v_mov_b32_e32 v11, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, v12, s[4:5] -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB0_5 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB0_5 ; GFX9-NEXT: ; %bb.2: ; %udiv-preheader ; GFX9-NEXT: v_sub_u32_e32 v10, 64, v24 ; GFX9-NEXT: v_lshrrev_b64 v[8:9], v24, v[0:1] @@ -184,16 +187,16 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_or_b32_e32 v19, v25, v27 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] ; GFX9-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v19, v9 ; GFX9-NEXT: v_or3_b32 v7, v7, 0, v11 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[10:11], 
exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v18, v8 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB0_3 +; GFX9-NEXT: s_cselect_b64 exec, s[10:11], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB0_3 ; GFX9-NEXT: ; %bb.4: ; %Flow -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: .LBB0_5: ; %Flow2 ; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: .LBB0_5: ; %Flow2 ; GFX9-NEXT: v_lshlrev_b64 v[14:15], 1, v[12:13] ; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 31, v13 @@ -201,8 +204,8 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_or3_b32 v12, v6, v12, v10 ; GFX9-NEXT: v_or_b32_e32 v10, v9, v15 ; GFX9-NEXT: v_or_b32_e32 v13, v8, v14 -; GFX9-NEXT: .LBB0_6: ; %Flow3 ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-NEXT: .LBB0_6: ; %udiv-end ; GFX9-NEXT: v_mul_lo_u32 v16, v13, v5 ; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v23, v13, 0 ; GFX9-NEXT: v_mov_b32_e32 v15, 0 @@ -244,8 +247,8 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane @@ -574,32 +577,25 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX9-O0-NEXT: 
v_mov_b32_e32 v4, v5 -; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7] +; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec -; GFX9-O0-NEXT: v_writelane_b32 v0, s4, 4 -; GFX9-O0-NEXT: v_writelane_b32 v0, s5, 5 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 4 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 5 +; GFX9-O0-NEXT: s_mov_b64 s[22:23], exec +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: s_cbranch_execz .LBB0_3 -; GFX9-O0-NEXT: s_branch .LBB0_8 +; GFX9-O0-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_cbranch_scc1 .LBB0_7 +; GFX9-O0-NEXT: s_branch .LBB0_2 ; GFX9-O0-NEXT: .LBB0_1: ; %Flow -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v0, 6 -; GFX9-O0-NEXT: v_readlane_b32 s5, v0, 7 -; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: ; %bb.2: ; %Flow ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 
offset:156 ; 4-byte Folded Reload @@ -608,7 +604,12 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(6) +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 6 +; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 7 ; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill @@ -621,15 +622,9 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_branch .LBB0_5 -; GFX9-O0-NEXT: .LBB0_3: ; %Flow2 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v4, 4 -; GFX9-O0-NEXT: v_readlane_b32 s5, v4, 5 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: s_branch .LBB0_4 +; GFX9-O0-NEXT: .LBB0_2: ; %Flow2 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -641,8 +636,8 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 
%rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_branch .LBB0_9 -; GFX9-O0-NEXT: .LBB0_4: ; %udiv-loop-exit +; GFX9-O0-NEXT: s_branch .LBB0_8 +; GFX9-O0-NEXT: .LBB0_3: ; %udiv-loop-exit ; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload @@ -651,13 +646,17 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b32 s4, 1 -; GFX9-O0-NEXT: s_waitcnt vmcnt(2) -; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[0:1] +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_lshlrev_b64 v[9:10], s4, v[9:10] -; GFX9-O0-NEXT: s_mov_b32 s4, 63 -; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 4 +; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 5 +; GFX9-O0-NEXT: s_mov_b32 s6, 1 +; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s6, v[0:1] +; GFX9-O0-NEXT: v_lshlrev_b64 v[9:10], s6, v[9:10] +; GFX9-O0-NEXT: s_mov_b32 s6, 63 +; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s6, v[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v8 @@ -681,15 +680,9 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 
; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_branch .LBB0_3 -; GFX9-O0-NEXT: .LBB0_5: ; %Flow1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 8 -; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 9 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: s_branch .LBB0_2 +; GFX9-O0-NEXT: .LBB0_4: ; %Flow1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -711,15 +704,9 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_branch .LBB0_4 -; GFX9-O0-NEXT: .LBB0_6: ; %udiv-do-while +; GFX9-O0-NEXT: s_branch .LBB0_3 +; GFX9-O0-NEXT: .LBB0_5: ; %udiv-do-while ; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s6, v16, 10 -; GFX9-O0-NEXT: v_readlane_b32 s7, v16, 11 ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload @@ -736,6 +723,9 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 
%rhs) { ; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload @@ -744,8 +734,10 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(8) +; GFX9-O0-NEXT: v_readlane_b32 s6, v16, 8 +; GFX9-O0-NEXT: v_readlane_b32 s7, v16, 9 ; GFX9-O0-NEXT: s_mov_b32 s4, 63 -; GFX9-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-O0-NEXT: v_lshrrev_b64 v[29:30], s4, v[2:3] ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v30 ; GFX9-O0-NEXT: s_mov_b32 s5, 1 @@ -772,7 +764,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v30 -; GFX9-O0-NEXT: s_waitcnt vmcnt(10) ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v28 ; GFX9-O0-NEXT: v_or3_b32 v6, v6, v7, v10 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 @@ -782,7 +773,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-O0-NEXT: s_waitcnt vmcnt(8) ; GFX9-O0-NEXT: 
v_mov_b32_e32 v6, v26 ; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 @@ -880,7 +870,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v18, v19 ; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[17:18], v[12:13] -; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX9-O0-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v18, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v2 ; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill @@ -901,12 +891,9 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 6 -; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 7 -; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 10 -; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 11 +; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX9-O0-NEXT: v_writelane_b32 v16, s4, 8 +; GFX9-O0-NEXT: v_writelane_b32 v16, s5, 9 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 ; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] @@ -934,10 +921,11 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: s_cbranch_execnz .LBB0_6 +; GFX9-O0-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX9-O0-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX9-O0-NEXT: s_cbranch_scc1 .LBB0_5 ; GFX9-O0-NEXT: 
s_branch .LBB0_1 -; GFX9-O0-NEXT: .LBB0_7: ; %udiv-preheader +; GFX9-O0-NEXT: .LBB0_6: ; %udiv-preheader ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload @@ -1040,8 +1028,8 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v14, s8 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, s7 ; GFX9-O0-NEXT: v_mov_b32_e32 v12, s6 -; GFX9-O0-NEXT: v_writelane_b32 v16, s4, 10 -; GFX9-O0-NEXT: v_writelane_b32 v16, s5, 11 +; GFX9-O0-NEXT: v_writelane_b32 v16, s4, 8 +; GFX9-O0-NEXT: v_writelane_b32 v16, s5, 9 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 ; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] @@ -1069,8 +1057,8 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_branch .LBB0_6 -; GFX9-O0-NEXT: .LBB0_8: ; %udiv-bb1 +; GFX9-O0-NEXT: s_branch .LBB0_5 +; GFX9-O0-NEXT: .LBB0_7: ; %udiv-bb1 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] @@ -1195,18 +1183,17 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec -; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] -; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] -; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 8 -; GFX9-O0-NEXT: 
v_writelane_b32 v0, s7, 9 +; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 6 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 7 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: s_cbranch_execz .LBB0_5 -; GFX9-O0-NEXT: s_branch .LBB0_7 -; GFX9-O0-NEXT: .LBB0_9: ; %udiv-end +; GFX9-O0-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-O0-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_cbranch_scc1 .LBB0_6 +; GFX9-O0-NEXT: s_branch .LBB0_4 +; GFX9-O0-NEXT: .LBB0_8: ; %udiv-end ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] @@ -1499,8 +1486,10 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) @@ -1553,6 +1542,7 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_subbrev_co_u32_e32 v10, vcc, 0, v11, vcc ; GFX9-NEXT: v_subbrev_co_u32_e32 v11, vcc, 0, v11, vcc ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] +; GFX9-NEXT: s_mov_b64 s[8:9], exec ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 
0, v[10:11] ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc @@ -1566,13 +1556,14 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13] ; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], vcc ; GFX9-NEXT: v_cndmask_b32_e64 v15, v3, 0, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v13, v1, 0, s[4:5] +; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec ; GFX9-NEXT: v_cndmask_b32_e64 v12, v0, 0, s[4:5] -; GFX9-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB1_6 +; GFX9-NEXT: s_cmov_b64 exec, s[6:7] +; GFX9-NEXT: s_cbranch_scc0 .LBB1_6 ; GFX9-NEXT: ; %bb.1: ; %udiv-bb1 ; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 1, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v9, vcc @@ -1591,20 +1582,21 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_or_b32_e32 v11, v11, v13 ; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v15 ; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[4:5] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v15 ; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v11, s[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[10:11], v15, v[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v15 -; GFX9-NEXT: v_mov_b32_e32 v12, 0 -; GFX9-NEXT: v_mov_b32_e32 v14, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v3, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v2, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-NEXT: v_mov_b32_e32 v14, 0 +; GFX9-NEXT: s_xor_b64 s[6:7], vcc, exec ; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, v11, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, v10, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: v_mov_b32_e32 v15, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, v10, s[4:5] -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB1_5 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc 
+; GFX9-NEXT: s_cbranch_scc0 .LBB1_5 ; GFX9-NEXT: ; %bb.2: ; %udiv-preheader ; GFX9-NEXT: v_sub_u32_e32 v14, 64, v22 ; GFX9-NEXT: v_lshrrev_b64 v[12:13], v22, v[0:1] @@ -1661,22 +1653,22 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, -1, v24, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v25, vcc ; GFX9-NEXT: v_or_b32_e32 v11, v21, v11 -; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] ; GFX9-NEXT: v_or_b32_e32 v20, v22, v24 ; GFX9-NEXT: v_or_b32_e32 v21, v23, v25 +; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21] ; GFX9-NEXT: v_or3_b32 v8, v8, v12, v14 ; GFX9-NEXT: v_and_b32_e32 v12, 1, v30 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v21, v13 ; GFX9-NEXT: v_or3_b32 v9, v9, 0, v15 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[10:11], exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v20, v12 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: s_cselect_b64 exec, s[10:11], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB1_3 ; GFX9-NEXT: ; %bb.4: ; %Flow -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: .LBB1_5: ; %Flow2 ; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: .LBB1_5: ; %Flow2 ; GFX9-NEXT: v_lshlrev_b64 v[16:17], 1, v[10:11] ; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 31, v11 @@ -1684,8 +1676,8 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_or3_b32 v14, v8, v10, v14 ; GFX9-NEXT: v_or_b32_e32 v13, v13, v17 ; GFX9-NEXT: v_or_b32_e32 v12, v12, v16 -; GFX9-NEXT: .LBB1_6: ; %Flow3 ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-NEXT: .LBB1_6: ; %udiv-end ; GFX9-NEXT: v_mul_lo_u32 v19, v12, v7 ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v12, 0 ; GFX9-NEXT: v_mov_b32_e32 v17, 0 @@ -1719,8 +1711,8 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: 
s_xor_saveexec_b64 s[4:5], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane @@ -1964,32 +1956,25 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 -; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7] +; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec -; GFX9-O0-NEXT: v_writelane_b32 v0, s4, 2 -; GFX9-O0-NEXT: v_writelane_b32 v0, s5, 3 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 2 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 3 +; GFX9-O0-NEXT: s_mov_b64 s[18:19], exec +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: s_and_b64 
s[4:5], s[4:5], s[6:7] -; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: s_cbranch_execz .LBB1_3 -; GFX9-O0-NEXT: s_branch .LBB1_8 +; GFX9-O0-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_cbranch_scc1 .LBB1_7 +; GFX9-O0-NEXT: s_branch .LBB1_2 ; GFX9-O0-NEXT: .LBB1_1: ; %Flow -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v0, 4 -; GFX9-O0-NEXT: v_readlane_b32 s5, v0, 5 -; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: ; %bb.2: ; %Flow ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload @@ -1998,7 +1983,12 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(6) +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 4 +; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 5 ; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill @@ -2011,15 +2001,9 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) 
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_branch .LBB1_5 -; GFX9-O0-NEXT: .LBB1_3: ; %Flow2 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v4, 2 -; GFX9-O0-NEXT: v_readlane_b32 s5, v4, 3 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: s_branch .LBB1_4 +; GFX9-O0-NEXT: .LBB1_2: ; %Flow2 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -2031,8 +2015,8 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_branch .LBB1_9 -; GFX9-O0-NEXT: .LBB1_4: ; %udiv-loop-exit +; GFX9-O0-NEXT: s_branch .LBB1_8 +; GFX9-O0-NEXT: .LBB1_3: ; %udiv-loop-exit ; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload @@ -2041,13 +2025,17 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b32 s4, 1 -; GFX9-O0-NEXT: s_waitcnt vmcnt(2) -; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[0:1] +; 
GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_lshlrev_b64 v[9:10], s4, v[9:10] -; GFX9-O0-NEXT: s_mov_b32 s4, 63 -; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 2 +; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 3 +; GFX9-O0-NEXT: s_mov_b32 s6, 1 +; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s6, v[0:1] +; GFX9-O0-NEXT: v_lshlrev_b64 v[9:10], s6, v[9:10] +; GFX9-O0-NEXT: s_mov_b32 s6, 63 +; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s6, v[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v8 @@ -2071,15 +2059,9 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_branch .LBB1_3 -; GFX9-O0-NEXT: .LBB1_5: ; %Flow1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 6 -; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 7 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: s_branch .LBB1_2 +; GFX9-O0-NEXT: .LBB1_4: ; %Flow1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload @@ -2101,15 +2083,9 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_branch .LBB1_4 -; GFX9-O0-NEXT: .LBB1_6: ; %udiv-do-while +; GFX9-O0-NEXT: s_branch .LBB1_3 +; GFX9-O0-NEXT: .LBB1_5: ; %udiv-do-while ; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s6, v16, 8 -; GFX9-O0-NEXT: v_readlane_b32 s7, v16, 9 ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload @@ -2126,6 +2102,9 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload @@ -2134,8 +2113,10 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(8) 
+; GFX9-O0-NEXT: v_readlane_b32 s6, v16, 6 +; GFX9-O0-NEXT: v_readlane_b32 s7, v16, 7 ; GFX9-O0-NEXT: s_mov_b32 s4, 63 -; GFX9-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-O0-NEXT: v_lshrrev_b64 v[29:30], s4, v[2:3] ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v30 ; GFX9-O0-NEXT: s_mov_b32 s5, 1 @@ -2162,7 +2143,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v30 -; GFX9-O0-NEXT: s_waitcnt vmcnt(10) ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v28 ; GFX9-O0-NEXT: v_or3_b32 v6, v6, v7, v10 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 @@ -2172,7 +2152,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-O0-NEXT: s_waitcnt vmcnt(8) ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v26 ; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 @@ -2270,7 +2249,7 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v18, v19 ; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[17:18], v[12:13] -; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX9-O0-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v18, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v2 ; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill @@ -2291,12 +2270,9 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 4 -; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 5 -; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; 
GFX9-O0-NEXT: v_writelane_b32 v16, s6, 8 -; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 9 +; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX9-O0-NEXT: v_writelane_b32 v16, s4, 6 +; GFX9-O0-NEXT: v_writelane_b32 v16, s5, 7 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] @@ -2324,10 +2300,11 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: s_cbranch_execnz .LBB1_6 +; GFX9-O0-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX9-O0-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX9-O0-NEXT: s_cbranch_scc1 .LBB1_5 ; GFX9-O0-NEXT: s_branch .LBB1_1 -; GFX9-O0-NEXT: .LBB1_7: ; %udiv-preheader +; GFX9-O0-NEXT: .LBB1_6: ; %udiv-preheader ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload @@ -2430,8 +2407,8 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v14, s8 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, s7 ; GFX9-O0-NEXT: v_mov_b32_e32 v12, s6 -; GFX9-O0-NEXT: v_writelane_b32 v16, s4, 8 -; GFX9-O0-NEXT: v_writelane_b32 v16, s5, 9 +; GFX9-O0-NEXT: v_writelane_b32 v16, s4, 6 +; GFX9-O0-NEXT: v_writelane_b32 v16, s5, 7 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] @@ -2459,8 +2436,8 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; 
GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_branch .LBB1_6 -; GFX9-O0-NEXT: .LBB1_8: ; %udiv-bb1 +; GFX9-O0-NEXT: s_branch .LBB1_5 +; GFX9-O0-NEXT: .LBB1_7: ; %udiv-bb1 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] @@ -2585,18 +2562,17 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec -; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] -; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] -; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 6 -; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 7 +; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 4 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 5 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: s_cbranch_execz .LBB1_5 -; GFX9-O0-NEXT: s_branch .LBB1_7 -; GFX9-O0-NEXT: .LBB1_9: ; %udiv-end +; GFX9-O0-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-O0-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_cbranch_scc1 .LBB1_6 +; GFX9-O0-NEXT: s_branch .LBB1_4 +; GFX9-O0-NEXT: .LBB1_8: ; %udiv-end ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] @@ -2853,8 +2829,10 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; 
GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/required-export-priority.ll b/llvm/test/CodeGen/AMDGPU/required-export-priority.ll index ebc209bd4d4510..b5ec4fe77c45e1 100644 --- a/llvm/test/CodeGen/AMDGPU/required-export-priority.ll +++ b/llvm/test/CodeGen/AMDGPU/required-export-priority.ll @@ -132,9 +132,10 @@ define amdgpu_ps void @test_if_export_f32(i32 %flag, float %x, float %y, float % ; GCN-LABEL: test_if_export_f32: ; GCN: ; %bb.0: ; GCN-NEXT: s_setprio 2 -; GCN-NEXT: s_mov_b32 s0, exec_lo -; GCN-NEXT: v_cmpx_ne_u32_e32 0, v0 -; GCN-NEXT: s_cbranch_execz .LBB9_2 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GCN-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GCN-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GCN-NEXT: s_cbranch_scc0 .LBB9_2 ; GCN-NEXT: ; %bb.1: ; %exp ; GCN-NEXT: exp mrt0 v1, v2, v3, v4 ; GCN-NEXT: s_setprio 0 @@ -159,9 +160,10 @@ define amdgpu_ps void @test_if_export_vm_f32(i32 %flag, float %x, float %y, floa ; GCN-LABEL: test_if_export_vm_f32: ; GCN: ; %bb.0: ; GCN-NEXT: s_setprio 2 -; GCN-NEXT: s_mov_b32 s0, exec_lo -; GCN-NEXT: v_cmpx_ne_u32_e32 0, v0 -; GCN-NEXT: s_cbranch_execz .LBB10_2 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GCN-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GCN-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GCN-NEXT: s_cbranch_scc0 .LBB10_2 ; GCN-NEXT: ; %bb.1: ; %exp ; GCN-NEXT: exp mrt0 v1, v2, v3, v4 ; GCN-NEXT: s_setprio 0 @@ -186,9 +188,10 @@ define amdgpu_ps void @test_if_export_done_f32(i32 %flag, float 
%x, float %y, fl ; GCN-LABEL: test_if_export_done_f32: ; GCN: ; %bb.0: ; GCN-NEXT: s_setprio 2 -; GCN-NEXT: s_mov_b32 s0, exec_lo -; GCN-NEXT: v_cmpx_ne_u32_e32 0, v0 -; GCN-NEXT: s_cbranch_execz .LBB11_2 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GCN-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GCN-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GCN-NEXT: s_cbranch_scc0 .LBB11_2 ; GCN-NEXT: ; %bb.1: ; %exp ; GCN-NEXT: exp mrt0 v1, v2, v3, v4 done ; GCN-NEXT: s_setprio 0 @@ -213,9 +216,10 @@ define amdgpu_ps void @test_if_export_vm_done_f32(i32 %flag, float %x, float %y, ; GCN-LABEL: test_if_export_vm_done_f32: ; GCN: ; %bb.0: ; GCN-NEXT: s_setprio 2 -; GCN-NEXT: s_mov_b32 s0, exec_lo -; GCN-NEXT: v_cmpx_ne_u32_e32 0, v0 -; GCN-NEXT: s_cbranch_execz .LBB12_2 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GCN-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GCN-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GCN-NEXT: s_cbranch_scc0 .LBB12_2 ; GCN-NEXT: ; %bb.1: ; %exp ; GCN-NEXT: exp mrt0 v1, v2, v3, v4 done ; GCN-NEXT: s_setprio 0 diff --git a/llvm/test/CodeGen/AMDGPU/scheduler-rp-calc-one-successor-two-predecessors-bug.ll b/llvm/test/CodeGen/AMDGPU/scheduler-rp-calc-one-successor-two-predecessors-bug.ll index 8cb1d250a6fa72..01f8821d3250cd 100644 --- a/llvm/test/CodeGen/AMDGPU/scheduler-rp-calc-one-successor-two-predecessors-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/scheduler-rp-calc-one-successor-two-predecessors-bug.ll @@ -12,13 +12,14 @@ define amdgpu_ps void @_amdgpu_ps_main(float %arg) { ; GFX900-NEXT: s_mov_b64 s[4:5], exec ; GFX900-NEXT: s_wqm_b64 exec, exec ; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: s_mov_b32 s0, 0 ; GFX900-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 +; GFX900-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX900-NEXT: s_mov_b32 s0, 0 +; GFX900-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX900-NEXT: ; implicit-def: $vgpr0 -; GFX900-NEXT: ; implicit-def: $sgpr2 -; GFX900-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX900-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX900-NEXT: s_cbranch_execz .LBB0_2 +; GFX900-NEXT: ; 
implicit-def: $sgpr1 +; GFX900-NEXT: s_cmov_b64 exec, vcc +; GFX900-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX900-NEXT: ; %bb.1: ; %bb1 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: s_mov_b32 s1, s0 @@ -33,14 +34,15 @@ define amdgpu_ps void @_amdgpu_ps_main(float %arg) { ; GFX900-NEXT: s_mov_b32 s14, s0 ; GFX900-NEXT: s_mov_b32 s15, s0 ; GFX900-NEXT: image_sample v[0:1], v[0:1], s[8:15], s[0:3] dmask:0x3 -; GFX900-NEXT: s_mov_b32 s2, 1.0 +; GFX900-NEXT: s_mov_b32 s1, 1.0 +; GFX900-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX900-NEXT: .LBB0_2: ; %Flow -; GFX900-NEXT: s_or_saveexec_b64 s[0:1], s[6:7] ; GFX900-NEXT: s_and_b64 exec, exec, s[4:5] -; GFX900-NEXT: s_and_b64 s[0:1], exec, s[0:1] -; GFX900-NEXT: v_mov_b32_e32 v2, s2 -; GFX900-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX900-NEXT: s_cbranch_execz .LBB0_5 +; GFX900-NEXT: v_mov_b32_e32 v2, s1 +; GFX900-NEXT: s_xor_b64 s[0:1], s[6:7], exec +; GFX900-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX900-NEXT: s_cmov_b64 exec, s[6:7] +; GFX900-NEXT: s_cbranch_scc0 .LBB0_5 ; GFX900-NEXT: ; %bb.3: ; %bb5 ; GFX900-NEXT: s_andn2_b64 s[4:5], s[4:5], exec ; GFX900-NEXT: s_cbranch_scc0 .LBB0_6 @@ -49,8 +51,8 @@ define amdgpu_ps void @_amdgpu_ps_main(float %arg) { ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v1, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: .LBB0_5: ; %bb6 ; GFX900-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX900-NEXT: .LBB0_5: ; %bb6 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_cvt_pkrtz_f16_f32 v1, 0, v1 ; GFX900-NEXT: v_cvt_pkrtz_f16_f32 v0, v2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index f4776747f16ac1..d81c241b41979f 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -366,41 +366,44 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_xor_b32_e32 v1, v3, v13 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v13 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v13, vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] 
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[6:7] ; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 -; GCN-IR-NEXT: v_add_i32_e64 v2, s[6:7], 32, v2 +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[4:5] +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 ; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3 ; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v6 -; GCN-IR-NEXT: v_add_i32_e64 v2, s[6:7], 32, v2 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v7 ; GCN-IR-NEXT: v_min_u32_e32 v11, v2, v3 -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[6:7], v10, v11 -; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[6:7] -; GCN-IR-NEXT: v_subb_u32_e64 v3, s[6:7], 0, 0, s[6:7] -; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[2:3] -; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3] -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v10, v11 +; GCN-IR-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[4:5], 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[8:9], s[8:9], vcc +; GCN-IR-NEXT: s_xor_b64 s[10:11], s[8:9], -1 +; GCN-IR-NEXT: s_and_b64 s[4:5], s[10:11], s[4:5] +; GCN-IR-NEXT: s_mov_b64 s[6:7], exec ; GCN-IR-NEXT: v_mov_b32_e32 v14, v12 ; GCN-IR-NEXT: v_mov_b32_e32 v15, v13 -; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v7, 0, s[4:5] -; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v6, 0, s[4:5] -; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB1_6 +; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v7, 0, s[8:9] +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v6, 0, s[8:9] +; GCN-IR-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_scc0 .LBB1_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v2 ; GCN-IR-NEXT: 
v_addc_u32_e32 v9, vcc, 0, v3, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[6:7], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 +; GCN-IR-NEXT: s_xor_b64 s[8:9], vcc, exec ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB1_5 +; GCN-IR-NEXT: s_and_b64 s[4:5], vcc, -1 +; GCN-IR-NEXT: s_cmov_b64 exec, vcc +; GCN-IR-NEXT: s_cbranch_scc0 .LBB1_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, -1, v1, vcc @@ -418,34 +421,34 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 ; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v16, v8 +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v17, v9, vcc +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 ; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 +; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 ; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 ; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1 ; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0 -; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10 ; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 +; GCN-IR-NEXT: s_andn2_b64 s[4:5], exec, s[10:11] ; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz 
.LBB1_3 +; GCN-IR-NEXT: s_cselect_b64 exec, s[4:5], s[10:11] +; GCN-IR-NEXT: s_cbranch_scc1 .LBB1_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB1_5: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB1_5: ; %Flow4 ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v1 ; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v0 -; GCN-IR-NEXT: .LBB1_6: ; %Flow5 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: .LBB1_6: ; %udiv-end ; GCN-IR-NEXT: v_xor_b32_e32 v0, v13, v12 ; GCN-IR-NEXT: v_xor_b32_e32 v1, v15, v14 ; GCN-IR-NEXT: v_xor_b32_e32 v3, v4, v0 @@ -1490,22 +1493,25 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) { ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: v_cndmask_b32_e64 v4, 24, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GCN-IR-NEXT: s_mov_b64 s[8:9], exec ; GCN-IR-NEXT: v_mov_b32_e32 v13, v12 +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB11_6 +; GCN-IR-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_scc0 .LBB11_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], 24, v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 +; GCN-IR-NEXT: s_xor_b64 s[6:7], vcc, exec ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB11_5 +; GCN-IR-NEXT: s_and_b64 s[4:5], vcc, -1 +; GCN-IR-NEXT: s_cmov_b64 exec, vcc +; GCN-IR-NEXT: s_cbranch_scc0 .LBB11_5 ; GCN-IR-NEXT: ; %bb.2: ; 
%udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v1, vcc @@ -1521,34 +1527,34 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 ; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v14, v8 +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v15, v9, vcc +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 ; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 +; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 ; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 ; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1 ; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0 -; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10 ; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 +; GCN-IR-NEXT: s_andn2_b64 s[4:5], exec, s[10:11] ; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB11_3 +; GCN-IR-NEXT: s_cselect_b64 exec, s[4:5], s[10:11] +; GCN-IR-NEXT: s_cbranch_scc1 .LBB11_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: .LBB11_5: ; %Flow4 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v1 ; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v0 -; GCN-IR-NEXT: .LBB11_6: ; %Flow5 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB11_6: ; %udiv-end ; GCN-IR-NEXT: v_xor_b32_e32 
v0, v4, v12 ; GCN-IR-NEXT: v_xor_b32_e32 v1, v5, v13 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 @@ -1684,23 +1690,26 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GCN-IR-NEXT: s_mov_b64 s[8:9], exec ; GCN-IR-NEXT: v_mov_b32_e32 v13, v12 +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB12_6 +; GCN-IR-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_scc0 .LBB12_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2 -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 +; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[4:5], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 +; GCN-IR-NEXT: s_xor_b64 s[6:7], vcc, exec ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GCN-IR-NEXT: s_cbranch_execz .LBB12_5 +; GCN-IR-NEXT: s_and_b64 s[10:11], vcc, -1 +; GCN-IR-NEXT: s_cmov_b64 exec, vcc +; GCN-IR-NEXT: s_cbranch_scc0 .LBB12_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v1, vcc @@ -1716,34 +1725,34 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 ; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v14, v8 +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; 
GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v15, v9, vcc +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 ; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 +; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 ; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 ; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1 ; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0 -; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10 ; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 +; GCN-IR-NEXT: s_andn2_b64 s[4:5], exec, s[10:11] ; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB12_3 +; GCN-IR-NEXT: s_cselect_b64 exec, s[4:5], s[10:11] +; GCN-IR-NEXT: s_cbranch_scc1 .LBB12_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: .LBB12_5: ; %Flow4 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v1 ; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v0 -; GCN-IR-NEXT: .LBB12_6: ; %Flow5 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB12_6: ; %udiv-end ; GCN-IR-NEXT: v_xor_b32_e32 v0, v4, v12 ; GCN-IR-NEXT: v_xor_b32_e32 v1, v5, v13 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 @@ -1780,26 +1789,29 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5] ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] ; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[0:1] -; GCN-IR-NEXT: v_mov_b32_e32 v11, v10 +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[0:1] ; GCN-IR-NEXT: s_or_b64 
s[4:5], vcc, s[4:5] -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[0:1] -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GCN-IR-NEXT: s_xor_b64 s[10:11], s[4:5], -1 +; GCN-IR-NEXT: s_and_b64 s[6:7], s[10:11], s[6:7] +; GCN-IR-NEXT: s_mov_b64 s[8:9], exec +; GCN-IR-NEXT: v_mov_b32_e32 v11, v10 ; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v5, 0, s[4:5] +; GCN-IR-NEXT: s_and_b64 s[6:7], s[6:7], exec ; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[4:5] -; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB13_6 +; GCN-IR-NEXT: s_cmov_b64 exec, s[6:7] +; GCN-IR-NEXT: s_cbranch_scc0 .LBB13_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v0, s[4:5], 63, v0 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GCN-IR-NEXT: v_sub_i32_e64 v0, s[4:5], 63, v0 ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], v0 ; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: s_xor_b64 s[6:7], vcc, exec ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB13_5 +; GCN-IR-NEXT: s_and_b64 s[4:5], vcc, -1 +; GCN-IR-NEXT: s_cmov_b64 exec, vcc +; GCN-IR-NEXT: s_cbranch_scc0 .LBB13_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_lshr_b64 v[6:7], v[4:5], v6 ; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 0xffffffcf, v8 @@ -1824,23 +1836,23 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v8 ; GCN-IR-NEXT: v_and_b32_e32 v8, 0x8000, v8 ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] -; GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1 ; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], v6, v8 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v3 +; GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1 ; GCN-IR-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v7, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v9, v3 +; 
GCN-IR-NEXT: s_andn2_b64 s[4:5], exec, s[10:11] ; GCN-IR-NEXT: v_mov_b32_e32 v8, v2 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB13_3 +; GCN-IR-NEXT: s_cselect_b64 exec, s[4:5], s[10:11] +; GCN-IR-NEXT: s_cbranch_scc1 .LBB13_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: .LBB13_5: ; %Flow4 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 ; GCN-IR-NEXT: v_or_b32_e32 v3, v3, v1 ; GCN-IR-NEXT: v_or_b32_e32 v2, v2, v0 -; GCN-IR-NEXT: .LBB13_6: ; %Flow5 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB13_6: ; %udiv-end ; GCN-IR-NEXT: v_xor_b32_e32 v0, v2, v10 ; GCN-IR-NEXT: v_xor_b32_e32 v1, v3, v11 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 diff --git a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll index 81858bd3d29ee0..e04d4f6a48de12 100644 --- a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll +++ b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll @@ -6,14 +6,19 @@ define amdgpu_cs void @if_then(ptr addrspace(8) inreg %input, ptr addrspace(8) i ; GCN: ; %bb.0: ; %.entry ; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GCN-NEXT: s_mov_b32 s0, exec_lo +; GCN-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GCN-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GCN-NEXT: s_cbranch_scc0 .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %.bb0 ; GCN-NEXT: v_mov_b32_e32 v3, 1 -; GCN-NEXT: ; %bb.2: ; %.merge ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GCN-NEXT: .LBB0_2: ; %.merge ; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, 3, v0 -; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GCN-NEXT: s_cbranch_execz .LBB0_4 +; GCN-NEXT: s_mov_b32 s0, exec_lo +; GCN-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GCN-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GCN-NEXT: s_cbranch_scc0 .LBB0_4 ; 
GCN-NEXT: ; %bb.3: ; %.then ; GCN-NEXT: v_mov_b32_e32 v1, v3 ; GCN-NEXT: s_not_b32 exec_lo, exec_lo @@ -27,9 +32,9 @@ define amdgpu_cs void @if_then(ptr addrspace(8) inreg %input, ptr addrspace(8) i ; GCN-NEXT: v_mov_b32_e32 v4, -1 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: buffer_store_dword v4, v0, s[4:7], 0 offen -; GCN-NEXT: .LBB0_4: ; %.end ; GCN-NEXT: s_waitcnt_depctr 0xffe3 ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GCN-NEXT: .LBB0_4: ; %.end ; GCN-NEXT: v_mov_b32_e32 v0, -1 ; GCN-NEXT: buffer_store_dword v0, v3, s[4:7], 0 offen ; GCN-NEXT: s_endpgm @@ -65,21 +70,20 @@ define amdgpu_cs void @if_else_vgpr_opt(ptr addrspace(8) inreg %input, ptr addrs ; GCN: ; %bb.0: ; %.entry ; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GCN-NEXT: s_mov_b32 s0, exec_lo +; GCN-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GCN-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GCN-NEXT: s_cbranch_scc0 .LBB1_2 ; GCN-NEXT: ; %bb.1: ; %.bb0 ; GCN-NEXT: v_mov_b32_e32 v3, 1 -; GCN-NEXT: ; %bb.2: ; %.merge ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GCN-NEXT: .LBB1_2: ; %.merge ; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, 3, v0 -; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GCN-NEXT: s_xor_b32 s0, exec_lo, s0 -; GCN-NEXT: s_cbranch_execnz .LBB1_5 -; GCN-NEXT: ; %bb.3: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b32 s0, s0 -; GCN-NEXT: s_cbranch_execnz .LBB1_6 -; GCN-NEXT: .LBB1_4: ; %.end -; GCN-NEXT: s_endpgm -; GCN-NEXT: .LBB1_5: ; %.else +; GCN-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GCN-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GCN-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GCN-NEXT: s_cbranch_scc0 .LBB1_4 +; GCN-NEXT: ; %bb.3: ; %.else ; GCN-NEXT: s_or_saveexec_b32 s1, -1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b32 exec_lo, s1 @@ -94,11 +98,17 @@ define amdgpu_cs void @if_else_vgpr_opt(ptr addrspace(8) inreg %input, ptr addrs ; GCN-NEXT: v_mov_b32_e32 v3, -1 ; GCN-NEXT: buffer_store_dword v3, v0, s[4:7], 0 offen ; GCN-NEXT: ; implicit-def: $vgpr3 -; 
GCN-NEXT: s_andn2_saveexec_b32 s0, s0 -; GCN-NEXT: s_cbranch_execz .LBB1_4 -; GCN-NEXT: .LBB1_6: ; %.then +; GCN-NEXT: s_waitcnt_depctr 0xffe3 +; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GCN-NEXT: .LBB1_4: ; %Flow +; GCN-NEXT: s_xor_b32 s1, s0, exec_lo +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_cmov_b32 exec_lo, s0 +; GCN-NEXT: s_cbranch_scc0 .LBB1_6 +; GCN-NEXT: ; %bb.5: ; %.then ; GCN-NEXT: v_mov_b32_e32 v0, -1 ; GCN-NEXT: buffer_store_dword v0, v3, s[4:7], 0 offen +; GCN-NEXT: .LBB1_6: ; %.end ; GCN-NEXT: s_endpgm .entry: %LocalInvocationId.i0 = extractelement <3 x i32> %LocalInvocationId, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll index 0630cca7c099b8..9e69416a82082a 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll @@ -108,27 +108,30 @@ endif: define amdgpu_kernel void @sgpr_if_else_valu_br(ptr addrspace(1) %out, float %a, i32 %b, i32 %c, i32 %d, i32 %e) { ; SI-LABEL: sgpr_if_else_valu_br: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xc ; SI-NEXT: v_cvt_f32_u32_e32 v0, v0 -; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xc ; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v0 -; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc -; SI-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; SI-NEXT: s_cbranch_execz .LBB2_2 +; SI-NEXT: s_xor_b64 s[0:1], vcc, exec +; SI-NEXT: s_and_b64 s[8:9], vcc, -1 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB2_2 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_add_i32 s8, s6, s7 +; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: .LBB2_2: ; %Flow -; SI-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_xor_b64 s[6:7], s[0:1], exec +; SI-NEXT: s_and_b64 s[10:11], s[0:1], -1 ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: s_xor_b64 exec, exec, s[0:1] -; SI-NEXT: 
s_cbranch_execz .LBB2_4 +; SI-NEXT: s_cmov_b64 exec, s[0:1] +; SI-NEXT: s_cbranch_scc0 .LBB2_4 ; SI-NEXT: ; %bb.3: ; %if -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_add_i32 s4, s4, s5 -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_add_i32 s0, s4, s5 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: s_or_b64 exec, exec, s[6:7] ; SI-NEXT: .LBB2_4: ; %endif -; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -160,13 +163,14 @@ define amdgpu_kernel void @sgpr_if_else_valu_cmp_phi_br(ptr addrspace(1) %out, p ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_xor_b64 s[10:11], vcc, exec +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_and_b64 s[8:9], vcc, -1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: ; implicit-def: $sgpr8_sgpr9 -; SI-NEXT: s_and_saveexec_b64 s[10:11], vcc -; SI-NEXT: s_xor_b64 s[10:11], exec, s[10:11] -; SI-NEXT: s_cbranch_execz .LBB3_2 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB3_2 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 @@ -176,10 +180,13 @@ define amdgpu_kernel void @sgpr_if_else_valu_cmp_phi_br(ptr addrspace(1) %out, p ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v0 ; SI-NEXT: s_and_b64 s[8:9], vcc, exec ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_or_b64 exec, exec, s[10:11] ; SI-NEXT: .LBB3_2: ; %Flow ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_andn2_saveexec_b64 s[0:1], s[10:11] -; SI-NEXT: s_cbranch_execz .LBB3_4 +; SI-NEXT: s_xor_b64 s[0:1], s[10:11], exec +; SI-NEXT: s_and_b64 s[2:3], s[10:11], -1 +; SI-NEXT: s_cmov_b64 exec, s[10:11] +; SI-NEXT: s_cbranch_scc0 .LBB3_4 ; SI-NEXT: ; %bb.3: ; %if ; SI-NEXT: s_mov_b32 s15, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 @@ -191,8 +198,8 @@ define amdgpu_kernel void @sgpr_if_else_valu_cmp_phi_br(ptr 
addrspace(1) %out, p ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; SI-NEXT: s_and_b64 s[6:7], vcc, exec ; SI-NEXT: s_or_b64 s[8:9], s[2:3], s[6:7] -; SI-NEXT: .LBB3_4: ; %endif ; SI-NEXT: s_or_b64 exec, exec, s[0:1] +; SI-NEXT: .LBB3_4: ; %endif ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[8:9] diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll index d430ba758572d6..70aa8fbb3e03df 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll @@ -123,6 +123,9 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou ; GCN-NEXT: s_cbranch_scc1 .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %bb0 ; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[24:25] +; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 ; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[24:25] ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -142,9 +145,6 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou ; GCN-NEXT: v_readlane_b32 s17, v1, 13 ; GCN-NEXT: v_readlane_b32 s18, v1, 14 ; GCN-NEXT: v_readlane_b32 s19, v1, 15 -; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[24:25] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[4:19] ; GCN-NEXT: ;;#ASMEND @@ -202,7 +202,6 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou ; GCN-NEXT: v_readlane_b32 s21, v1, 61 ; GCN-NEXT: v_readlane_b32 s22, v1, 62 ; GCN-NEXT: v_readlane_b32 s23, v1, 63 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s4, v0, 0 ; GCN-NEXT: v_readlane_b32 s5, v0, 1 ; GCN-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll 
b/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll index 09e342fe190666..a2b803d52b4b0c 100644 --- a/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll @@ -8,37 +8,46 @@ define amdgpu_cs void @should_not_hoist_set_inactive(<4 x i32> inreg %i14, i32 i ; GCN-NEXT: v_cmp_eq_u32_e64 s5, 0, v0 ; GCN-NEXT: v_cmp_ne_u32_e64 s6, 0, v2 ; GCN-NEXT: s_mov_b32 s7, 0 -; GCN-NEXT: s_branch .LBB0_2 -; GCN-NEXT: .LBB0_1: ; %bb4 -; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 +; GCN-NEXT: s_branch .LBB0_3 +; GCN-NEXT: .LBB0_1: ; %Flow +; GCN-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; GCN-NEXT: s_waitcnt_depctr 0xffe3 ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GCN-NEXT: .LBB0_2: ; %bb4 +; GCN-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; GCN-NEXT: s_and_b32 s8, exec_lo, s6 ; GCN-NEXT: s_or_b32 s7, s8, s7 -; GCN-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 -; GCN-NEXT: s_cbranch_execz .LBB0_5 -; GCN-NEXT: .LBB0_2: ; %bb +; GCN-NEXT: s_andn2_b32 s8, exec_lo, s7 +; GCN-NEXT: s_cselect_b32 exec_lo, s8, s7 +; GCN-NEXT: s_cbranch_scc0 .LBB0_6 +; GCN-NEXT: .LBB0_3: ; %bb ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: s_and_saveexec_b32 s8, vcc_lo -; GCN-NEXT: s_cbranch_execz .LBB0_1 -; GCN-NEXT: ; %bb.3: ; %bb1 -; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 +; GCN-NEXT: s_mov_b32 s8, exec_lo +; GCN-NEXT: s_and_b32 s9, vcc_lo, exec_lo +; GCN-NEXT: s_cmov_b32 exec_lo, s9 +; GCN-NEXT: s_cbranch_scc0 .LBB0_2 +; GCN-NEXT: ; %bb.4: ; %bb1 +; GCN-NEXT: ; in Loop: Header=BB0_3 Depth=1 +; GCN-NEXT: s_mov_b32 s9, exec_lo ; GCN-NEXT: v_mov_b32_e32 v3, s4 ; GCN-NEXT: s_not_b32 exec_lo, exec_lo ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_not_b32 exec_lo, exec_lo -; GCN-NEXT: s_or_saveexec_b32 s9, -1 +; GCN-NEXT: s_or_saveexec_b32 s10, -1 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_dpp v4, v3 row_xmask:1 row_mask:0xf bank_mask:0xf -; GCN-NEXT: s_mov_b32 exec_lo, s9 +; GCN-NEXT: s_mov_b32 exec_lo, s10 ; GCN-NEXT: 
v_mov_b32_e32 v0, v4 -; GCN-NEXT: s_and_b32 exec_lo, exec_lo, s5 -; GCN-NEXT: s_cbranch_execz .LBB0_1 -; GCN-NEXT: ; %bb.4: ; %bb2 -; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 +; GCN-NEXT: s_and_b32 s10, s5, exec_lo +; GCN-NEXT: s_cmov_b32 exec_lo, s10 +; GCN-NEXT: s_cbranch_scc0 .LBB0_1 +; GCN-NEXT: ; %bb.5: ; %bb2 +; GCN-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; GCN-NEXT: buffer_atomic_add v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt_depctr 0xffe3 +; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GCN-NEXT: s_branch .LBB0_1 -; GCN-NEXT: .LBB0_5: ; %bb5 +; GCN-NEXT: .LBB0_6: ; %bb5 ; GCN-NEXT: s_endpgm .entry: br label %bb diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-kill.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-kill.ll index cef959f45437db..81db28e3491e9d 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-kill.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-kill.ll @@ -7,24 +7,29 @@ define amdgpu_ps float @uniform_kill(float %a, i32 %b, float %c) { ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 ; SI-NEXT: s_mov_b64 s[0:1], exec -; SI-NEXT: s_mov_b64 s[2:3], -1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_and_b32_e32 v0, 1, v0 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], vcc, exec +; SI-NEXT: s_and_b64 s[2:3], vcc, -1 +; SI-NEXT: s_mov_b64 s[2:3], -1 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB0_2 ; SI-NEXT: ; %bb.1: ; %if1 ; SI-NEXT: s_xor_b64 s[2:3], exec, -1 -; SI-NEXT: ; %bb.2: ; %endif1 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: .LBB0_2: ; %endif1 ; SI-NEXT: s_wqm_b64 s[4:5], s[2:3] ; SI-NEXT: s_xor_b64 s[4:5], s[4:5], exec ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; SI-NEXT: s_cbranch_scc0 .LBB0_6 ; SI-NEXT: ; %bb.3: ; %endif1 ; SI-NEXT: s_and_b64 exec, exec, s[0:1] +; SI-NEXT: s_mov_b64 s[0:1], exec +; SI-NEXT: s_and_b64 s[2:3], s[2:3], exec ; SI-NEXT: v_mov_b32_e32 v0, 0 -; SI-NEXT: s_and_saveexec_b64 s[0:1], s[2:3] -; SI-NEXT: 
s_cbranch_execz .LBB0_5 +; SI-NEXT: s_cmov_b64 exec, s[2:3] +; SI-NEXT: s_cbranch_scc0 .LBB0_5 ; SI-NEXT: ; %bb.4: ; %if2 ; SI-NEXT: s_mov_b32 s3, 0 ; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 @@ -36,8 +41,8 @@ define amdgpu_ps float @uniform_kill(float %a, i32 %b, float %c) { ; SI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: v_cvt_f32_i32_e32 v0, v0 -; SI-NEXT: .LBB0_5: ; %endif2 ; SI-NEXT: s_or_b64 exec, exec, s[0:1] +; SI-NEXT: .LBB0_5: ; %endif2 ; SI-NEXT: s_branch .LBB0_7 ; SI-NEXT: .LBB0_6: ; SI-NEXT: s_mov_b64 exec, 0 @@ -53,20 +58,25 @@ define amdgpu_ps float @uniform_kill(float %a, i32 %b, float %c) { ; FLAT-NEXT: v_or_b32_e32 v0, v1, v0 ; FLAT-NEXT: v_and_b32_e32 v0, 1, v0 ; FLAT-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; FLAT-NEXT: s_and_saveexec_b64 s[4:5], vcc +; FLAT-NEXT: s_xor_b64 s[4:5], vcc, exec +; FLAT-NEXT: s_cmp_lg_u64 vcc, 0 +; FLAT-NEXT: s_cmov_b64 exec, vcc +; FLAT-NEXT: s_cbranch_scc0 .LBB0_2 ; FLAT-NEXT: ; %bb.1: ; %if1 ; FLAT-NEXT: s_xor_b64 s[2:3], exec, -1 -; FLAT-NEXT: ; %bb.2: ; %endif1 ; FLAT-NEXT: s_or_b64 exec, exec, s[4:5] +; FLAT-NEXT: .LBB0_2: ; %endif1 ; FLAT-NEXT: s_wqm_b64 s[4:5], s[2:3] ; FLAT-NEXT: s_xor_b64 s[4:5], s[4:5], exec ; FLAT-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; FLAT-NEXT: s_cbranch_scc0 .LBB0_6 ; FLAT-NEXT: ; %bb.3: ; %endif1 ; FLAT-NEXT: s_and_b64 exec, exec, s[0:1] +; FLAT-NEXT: s_mov_b64 s[0:1], exec +; FLAT-NEXT: s_and_b64 s[2:3], s[2:3], exec ; FLAT-NEXT: v_mov_b32_e32 v0, 0 -; FLAT-NEXT: s_and_saveexec_b64 s[0:1], s[2:3] -; FLAT-NEXT: s_cbranch_execz .LBB0_5 +; FLAT-NEXT: s_cmov_b64 exec, s[2:3] +; FLAT-NEXT: s_cbranch_scc0 .LBB0_5 ; FLAT-NEXT: ; %bb.4: ; %if2 ; FLAT-NEXT: s_mov_b32 s3, 0 ; FLAT-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 @@ -78,8 +88,8 @@ define amdgpu_ps float @uniform_kill(float %a, i32 %b, float %c) { ; FLAT-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 offset:4 glc ; FLAT-NEXT: s_waitcnt vmcnt(0) ; FLAT-NEXT: 
v_cvt_f32_i32_e32 v0, v0 -; FLAT-NEXT: .LBB0_5: ; %endif2 ; FLAT-NEXT: s_or_b64 exec, exec, s[0:1] +; FLAT-NEXT: .LBB0_5: ; %endif2 ; FLAT-NEXT: s_branch .LBB0_7 ; FLAT-NEXT: .LBB0_6: ; FLAT-NEXT: s_mov_b64 exec, 0 diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll index 9f3596359a6625..f986f9d00b1821 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll @@ -16,10 +16,10 @@ define amdgpu_kernel void @break_inserted_outside_of_loop(ptr addrspace(1) %out, ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_and_b64 s[4:5], exec, vcc ; SI-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB0_1 +; SI-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; SI-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB0_1 ; SI-NEXT: ; %bb.2: ; %ENDLOOP -; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -41,10 +41,10 @@ define amdgpu_kernel void @break_inserted_outside_of_loop(ptr addrspace(1) %out, ; FLAT-NEXT: ; =>This Inner Loop Header: Depth=1 ; FLAT-NEXT: s_and_b64 s[4:5], exec, vcc ; FLAT-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] -; FLAT-NEXT: s_andn2_b64 exec, exec, s[0:1] -; FLAT-NEXT: s_cbranch_execnz .LBB0_1 +; FLAT-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; FLAT-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; FLAT-NEXT: s_cbranch_scc1 .LBB0_1 ; FLAT-NEXT: ; %bb.2: ; %ENDLOOP -; FLAT-NEXT: s_or_b64 exec, exec, s[0:1] ; FLAT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; FLAT-NEXT: s_mov_b32 s3, 0xf000 ; FLAT-NEXT: s_mov_b32 s2, -1 @@ -71,50 +71,54 @@ define amdgpu_kernel void @phi_cond_outside_loop(i32 %b) { ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, -1 +; SI-NEXT: s_mov_b64 s[6:7], exec ; SI-NEXT: s_mov_b64 
s[0:1], 0 ; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_and_saveexec_b64 s[6:7], vcc -; SI-NEXT: s_cbranch_execz .LBB1_2 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB1_2 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_load_dword s2, s[2:3], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s2, 0 ; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; SI-NEXT: s_and_b64 s[4:5], s[2:3], exec -; SI-NEXT: .LBB1_2: ; %endif ; SI-NEXT: s_or_b64 exec, exec, s[6:7] -; SI-NEXT: .LBB1_3: ; %loop +; SI-NEXT: .LBB1_2: ; %loop ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_and_b64 s[2:3], exec, s[4:5] ; SI-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB1_3 -; SI-NEXT: ; %bb.4: ; %exit +; SI-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; SI-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB1_2 +; SI-NEXT: ; %bb.3: ; %exit ; SI-NEXT: s_endpgm ; ; FLAT-LABEL: phi_cond_outside_loop: ; FLAT: ; %bb.0: ; %entry ; FLAT-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; FLAT-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; FLAT-NEXT: s_mov_b64 s[6:7], exec ; FLAT-NEXT: s_mov_b64 s[0:1], 0 ; FLAT-NEXT: s_mov_b64 s[4:5], 0 -; FLAT-NEXT: s_and_saveexec_b64 s[6:7], vcc -; FLAT-NEXT: s_cbranch_execz .LBB1_2 +; FLAT-NEXT: s_cmp_lg_u64 vcc, 0 +; FLAT-NEXT: s_cmov_b64 exec, vcc +; FLAT-NEXT: s_cbranch_scc0 .LBB1_2 ; FLAT-NEXT: ; %bb.1: ; %else ; FLAT-NEXT: s_load_dword s2, s[2:3], 0x24 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: s_cmp_eq_u32 s2, 0 ; FLAT-NEXT: s_cselect_b64 s[2:3], -1, 0 ; FLAT-NEXT: s_and_b64 s[4:5], s[2:3], exec -; FLAT-NEXT: .LBB1_2: ; %endif ; FLAT-NEXT: s_or_b64 exec, exec, s[6:7] -; FLAT-NEXT: .LBB1_3: ; %loop +; FLAT-NEXT: .LBB1_2: ; %loop ; FLAT-NEXT: ; =>This Inner Loop Header: Depth=1 ; FLAT-NEXT: s_and_b64 s[2:3], exec, s[4:5] ; FLAT-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; FLAT-NEXT: s_andn2_b64 exec, exec, s[0:1] -; FLAT-NEXT: s_cbranch_execnz .LBB1_3 -; FLAT-NEXT: ; %bb.4: ; %exit +; 
FLAT-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; FLAT-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; FLAT-NEXT: s_cbranch_scc1 .LBB1_2 +; FLAT-NEXT: ; %bb.3: ; %exit ; FLAT-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-dbg-info.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-dbg-info.ll index 15f6bb632f3113..886e14f91d9818 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-dbg-info.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-dbg-info.ll @@ -18,12 +18,13 @@ define amdgpu_ps i32 @if_else(i32 %0) !dbg !5 { ; OPT-NEXT: [[TMP8:%.*]] = extractvalue { i1, i64 } [[TMP6]], 1, !dbg [[DBG14]] ; OPT-NEXT: br i1 [[TMP7]], label [[TRUE:%.*]], label [[EXIT:%.*]], !dbg [[DBG14]] ; OPT: true: -; OPT-NEXT: br label [[EXIT]], !dbg [[DBG15:![0-9]+]] +; OPT-NEXT: call void @llvm.amdgcn.wave.reconverge.i64(i64 [[TMP8]]), !dbg [[DBG15:![0-9]+]] +; OPT-NEXT: br label [[EXIT]], !dbg [[DBG15]] ; OPT: false: -; OPT-NEXT: br label [[FLOW]], !dbg [[DBG16:![0-9]+]] +; OPT-NEXT: call void @llvm.amdgcn.wave.reconverge.i64(i64 [[TMP4]]), !dbg [[DBG16:![0-9]+]] +; OPT-NEXT: br label [[FLOW]], !dbg [[DBG16]] ; OPT: exit: ; OPT-NEXT: [[RET:%.*]] = phi i32 [ [[TMP5]], [[FLOW]] ], [ 42, [[TRUE]] ], !dbg [[DBG17:![0-9]+]] -; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP8]]) ; OPT-NEXT: #dbg_value(i32 [[RET]], [[META11:![0-9]+]], !DIExpression(), [[DBG17]]) ; OPT-NEXT: ret i32 [[RET]], !dbg [[DBG18:![0-9]+]] ; @@ -61,16 +62,15 @@ define amdgpu_ps void @loop_if_break(i32 %n) !dbg !19 { ; OPT: loop_body: ; OPT-NEXT: [[I_NEXT:%.*]] = sub i32 [[I]], 1, !dbg [[DBG28:![0-9]+]] ; OPT-NEXT: #dbg_value(i32 [[I_NEXT]], [[META23:![0-9]+]], !DIExpression(), [[DBG28]]) -; OPT-NEXT: br label [[FLOW]], !dbg [[DBG29:![0-9]+]] +; OPT-NEXT: call void @llvm.amdgcn.wave.reconverge.i64(i64 [[TMP2]]), !dbg [[DBG29:![0-9]+]] +; OPT-NEXT: br label [[FLOW]], !dbg [[DBG29]] ; OPT: Flow: ; OPT-NEXT: [[TMP3]] = phi i32 [ [[I_NEXT]], [[LOOP_BODY]] ], [ 
undef, [[LOOP]] ] ; OPT-NEXT: [[TMP4:%.*]] = phi i1 [ false, [[LOOP_BODY]] ], [ true, [[LOOP]] ] -; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP2]]) ; OPT-NEXT: [[TMP5]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP4]], i64 [[PHI_BROKEN]]), !dbg [[DBG27]] ; OPT-NEXT: [[TMP6:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP5]]), !dbg [[DBG27]] ; OPT-NEXT: br i1 [[TMP6]], label [[EXIT:%.*]], label [[LOOP]], !dbg [[DBG27]] ; OPT: exit: -; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP5]]) ; OPT-NEXT: ret void, !dbg [[DBG30:![0-9]+]] ; entry: diff --git a/llvm/test/CodeGen/AMDGPU/si-annotatecfg-multiple-backedges.ll b/llvm/test/CodeGen/AMDGPU/si-annotatecfg-multiple-backedges.ll index 0edd9f4cd6b4f5..3bf0926f21d76d 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotatecfg-multiple-backedges.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotatecfg-multiple-backedges.ll @@ -12,7 +12,7 @@ define amdgpu_kernel void @multiple_backedges(i32 %arg, ptr %arg1) { ; OPT-NEXT: [[TMP2:%.*]] = shl nsw i32 [[ARG:%.*]], 1 ; OPT-NEXT: br label [[LOOP:%.*]] ; OPT: loop: -; OPT-NEXT: [[PHI_BROKEN1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOOP_END:%.*]] ], [ [[PHI_BROKEN1]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] +; OPT-NEXT: [[PHI_BROKEN1:%.*]] = phi i64 [ [[TMP2:%.*]], [[LOOP_END:%.*]] ], [ [[PHI_BROKEN1]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] ; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ 0, [[LOOP_END]] ], [ [[TMP0:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ] ; OPT-NEXT: [[TMP4:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP5:%.*]], [[LOOP]] ], [ 0, [[LOOP_END]] ] ; OPT-NEXT: [[TMP5]] = add nsw i32 [[TMP4]], [[TMP]] @@ -21,13 +21,11 @@ define amdgpu_kernel void @multiple_backedges(i32 %arg, ptr %arg1) { ; OPT-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP0]]) ; OPT-NEXT: br i1 [[TMP1]], label [[LOOP_END]], label [[LOOP]] ; OPT: loop_end: -; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP0]]) ; OPT-NEXT: [[EXIT:%.*]] = icmp sgt i32 [[TMP5]], [[TMP2]] -; OPT-NEXT: [[TMP7]] = call i64 
@llvm.amdgcn.if.break.i64(i1 [[EXIT]], i64 [[PHI_BROKEN1]]) -; OPT-NEXT: [[TMP3:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP7]]) +; OPT-NEXT: [[TMP2]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[EXIT]], i64 [[PHI_BROKEN1]]) +; OPT-NEXT: [[TMP3:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP2]]) ; OPT-NEXT: br i1 [[TMP3]], label [[LOOP_EXIT:%.*]], label [[LOOP]] ; OPT: loop_exit: -; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP7]]) ; OPT-NEXT: [[TMP12:%.*]] = zext i32 [[TMP]] to i64 ; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[ARG1:%.*]], i64 [[TMP12]] ; OPT-NEXT: [[TMP14:%.*]] = addrspacecast ptr [[TMP13]] to ptr addrspace(1) diff --git a/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir b/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir index f1b88c76162985..c3a7b2b4645126 100644 --- a/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir +++ b/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir @@ -27,11 +27,13 @@ body: | %0 = PHI %8, %bb.0, %0, %bb.1, %2, %bb.2 %9 = V_MOV_B32_e32 9, implicit $exec %10 = V_CMP_EQ_U32_e64 %7, %9, implicit $exec - %1 = SI_IF %10, %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec + %1 = SI_IF %10, %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec S_BRANCH %bb.1 + bb.3: + SI_WAVE_RECONVERGE %1, implicit-def $exec, implicit-def $scc, implicit $exec + S_BRANCH %bb.2 bb.2: - SI_END_CF %1, implicit-def $exec, implicit-def $scc, implicit $exec %11 = S_MOV_B32 1 %2 = S_ADD_I32 %0, %11, implicit-def $scc S_BRANCH %bb.1 diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow.mir b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow.mir index eddad05d976bd3..fc2e81ad29d42a 100644 --- a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow.mir +++ b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow.mir @@ -28,12 +28,12 @@ body: | ; GCN: bb.0: ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: 
[[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], undef %1:sreg_64, implicit-def dead $scc - ; GCN-NEXT: dead [[S_XOR_B64_:%[0-9]+]]:sreg_64 = S_XOR_B64 [[S_AND_B64_]], [[COPY]], implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term [[S_AND_B64_]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec - ; GCN-NEXT: S_BRANCH %bb.1 + ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 undef %1:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: dead [[S_XOR_B64_:%[0-9]+]]:sreg_64 = S_XOR_B64 [[S_AND_B64_]], $exec, implicit-def $scc + ; GCN-NEXT: dead [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_AND_B64_]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.2 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.2(0x80000000) @@ -67,19 +67,17 @@ body: | ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GCN-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[COPY1]], implicit $exec - ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], [[COPY2]], implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term [[S_AND_B64_]] + ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_EQ_U32_e64_]], $exec, implicit-def $scc + ; GCN-NEXT: dead [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[V_CMP_EQ_U32_e64_]], implicit $scc ; GCN-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_]], implicit $exec - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec - ; GCN-NEXT: S_BRANCH %bb.2 + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; 
GCN-NEXT: S_BRANCH %bb.1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.2(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term]] - ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY3]], implicit-def $scc + ; GCN-NEXT: dead [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term]] ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: ; GCN-NEXT: S_ENDPGM 0 @@ -97,7 +95,6 @@ body: | successors: %bb.2 %6:sreg_64_xexec = COPY %5 - SI_END_CF killed %6, implicit-def $exec, implicit-def dead $scc, implicit $exec bb.2: S_ENDPGM 0 @@ -116,26 +113,19 @@ body: | ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GCN-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[COPY1]], implicit $exec - ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], [[COPY2]], implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term [[S_AND_B64_]] + ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_EQ_U32_e64_]], $exec, implicit-def $scc + ; GCN-NEXT: dead [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[V_CMP_EQ_U32_e64_]], implicit $scc ; GCN-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_]], implicit $exec - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec - ; GCN-NEXT: S_BRANCH %bb.2 + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: %bb.3(0x80000000) - ; GCN-NEXT: liveins: $vgpr0, $sgpr4_sgpr5 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term]] - ; GCN-NEXT: S_NOP 0 - ; GCN-NEXT: $exec = S_OR_B64_term 
$exec, [[COPY3]], implicit-def $scc - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.2(0x80000000) ; GCN-NEXT: liveins: $vgpr0, $sgpr4_sgpr5 ; GCN-NEXT: {{ $}} + ; GCN-NEXT: dead [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term]] + ; GCN-NEXT: S_NOP 0 ; GCN-NEXT: S_SLEEP 3 ; GCN-NEXT: S_NOP 0, implicit $vgpr0, implicit $sgpr4_sgpr5 ; GCN-NEXT: {{ $}} @@ -157,7 +147,6 @@ body: | %6:sreg_64_xexec = COPY %5 S_NOP 0 - SI_END_CF killed %6, implicit-def $exec, implicit-def dead $scc, implicit $exec S_SLEEP 3 S_NOP 0, implicit $vgpr0, implicit $sgpr4_sgpr5 @@ -178,25 +167,18 @@ body: | ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GCN-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[COPY1]], implicit $exec - ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], [[COPY2]], implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term [[S_AND_B64_]] + ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_EQ_U32_e64_]], $exec, implicit-def $scc + ; GCN-NEXT: dead [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[V_CMP_EQ_U32_e64_]], implicit $scc ; GCN-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_]], implicit $exec - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec - ; GCN-NEXT: S_BRANCH %bb.2 + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: %bb.3(0x80000000) - ; GCN-NEXT: liveins: $vgpr0, $sgpr4_sgpr5, $sgpr8_sgpr9_sgpr10_sgpr11:0x0000000000000003 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term]] - 
; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY3]], implicit-def $scc - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.2(0x80000000) - ; GCN-NEXT: liveins: $vgpr0, $sgpr4_sgpr5, $sgpr8_sgpr9_sgpr10 + ; GCN-NEXT: liveins: $vgpr0, $sgpr4_sgpr5, $sgpr8_sgpr9_sgpr10_sgpr11:0x0000000000000003 ; GCN-NEXT: {{ $}} + ; GCN-NEXT: dead [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term]] ; GCN-NEXT: S_SLEEP 3 ; GCN-NEXT: S_NOP 0 ; GCN-NEXT: {{ $}} @@ -219,7 +201,6 @@ body: | liveins: $vgpr0, $sgpr4_sgpr5, $sgpr8_sgpr9_sgpr10_sgpr11:0x00000003 %6:sreg_64_xexec = COPY %5 - SI_END_CF killed %6, implicit-def $exec, implicit-def dead $scc, implicit $exec S_SLEEP 3 S_NOP 0 @@ -241,23 +222,17 @@ body: | ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GCN-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[COPY1]], implicit $exec - ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], [[COPY2]], implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term [[S_AND_B64_]] + ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_EQ_U32_e64_]], $exec, implicit-def $scc + ; GCN-NEXT: dead [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[V_CMP_EQ_U32_e64_]], implicit $scc ; GCN-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_]], implicit $exec - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec - ; GCN-NEXT: S_BRANCH %bb.2 + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: %bb.3(0x80000000) - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY 
[[S_MOV_B64_term]] - ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY3]], implicit-def $scc - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.2(0x80000000) ; GCN-NEXT: {{ $}} + ; GCN-NEXT: dead [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term]] ; GCN-NEXT: $vgpr3 = V_MOV_B32_e32 0, implicit $exec ; GCN-NEXT: $sgpr4_sgpr5 = S_MOV_B64 32 ; GCN-NEXT: {{ $}} @@ -279,7 +254,6 @@ body: | successors: %bb.2 %6:sreg_64_xexec = COPY %5 - SI_END_CF killed %6, implicit-def $exec, implicit-def dead $scc, implicit $exec $vgpr3 = V_MOV_B32_e32 0, implicit $exec $sgpr4_sgpr5 = S_MOV_B64 32 @@ -301,26 +275,19 @@ body: | ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GCN-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[COPY1]], implicit $exec - ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], [[COPY2]], implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term [[S_AND_B64_]] + ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_EQ_U32_e64_]], $exec, implicit-def $scc + ; GCN-NEXT: dead [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[V_CMP_EQ_U32_e64_]], implicit $scc ; GCN-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_]], implicit $exec - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec - ; GCN-NEXT: S_BRANCH %bb.2 + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: %bb.3(0x80000000) - ; GCN-NEXT: liveins: $vgpr0, $sgpr4_sgpr5 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term]] - ; GCN-NEXT: 
$sgpr4_sgpr5 = S_MOV_B64 32 - ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY3]], implicit-def $scc - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.2(0x80000000) ; GCN-NEXT: liveins: $vgpr0, $sgpr4_sgpr5 ; GCN-NEXT: {{ $}} + ; GCN-NEXT: dead [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term]] + ; GCN-NEXT: $sgpr4_sgpr5 = S_MOV_B64 32 ; GCN-NEXT: S_SLEEP 3, implicit $sgpr4_sgpr5 ; GCN-NEXT: S_NOP 0 ; GCN-NEXT: {{ $}} @@ -344,7 +311,6 @@ body: | %6:sreg_64_xexec = COPY %5 $sgpr4_sgpr5 = S_MOV_B64 32 - SI_END_CF killed %6, implicit-def $exec, implicit-def dead $scc, implicit $exec S_SLEEP 3, implicit $sgpr4_sgpr5 S_NOP 0 @@ -371,20 +337,15 @@ body: | ; GCN-NEXT: dead [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: %bb.3(0x80000000) - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[V_CMP_EQ_U32_e64_]] - ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY3]], implicit-def $scc - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.2(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY4]], [[V_CMP_EQ_U32_e64_1]], implicit-def dead $scc - ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], [[COPY4]], implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term [[S_AND_B64_]] + ; GCN-NEXT: dead [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[V_CMP_EQ_U32_e64_]] + ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_1]], $exec, implicit-def $scc + ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], $exec, implicit-def $scc + ; GCN-NEXT: dead [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_AND_B64_]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_]], implicit $scc ; GCN-NEXT: dead [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term 
[[S_XOR_B64_]], implicit $exec - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GCN-NEXT: S_BRANCH %bb.2 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: ; GCN-NEXT: S_ENDPGM 0 @@ -402,7 +363,6 @@ body: | successors: %bb.2 %6:sreg_64_xexec = COPY %3 - SI_END_CF killed %6, implicit-def $exec, implicit-def dead $scc, implicit $exec %7:sreg_64_xexec = SI_IF %4, %bb.2, implicit-def $exec, implicit-def dead $scc, implicit $exec %8:sreg_64_xexec = S_MOV_B64_term %7, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-i1-copies-order-of-phi-incomings.mir b/llvm/test/CodeGen/AMDGPU/si-lower-i1-copies-order-of-phi-incomings.mir index ecbd47a9e8d0dd..da57e211d88714 100644 --- a/llvm/test/CodeGen/AMDGPU/si-lower-i1-copies-order-of-phi-incomings.mir +++ b/llvm/test/CodeGen/AMDGPU/si-lower-i1-copies-order-of-phi-incomings.mir @@ -61,12 +61,12 @@ body: | ; GCN-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32 = S_ANDN2_B32 [[S_OR_B32_]], $exec_lo, implicit-def $scc ; GCN-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_EQ_U32_e64_]], $exec_lo, implicit-def $scc ; GCN-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_ANDN2_B32_1]], [[S_AND_B32_1]], implicit-def $scc + ; GCN-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.4(0x04000000), %bb.1(0x7c000000) ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[PHI4:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]], %bb.1, [[S_OR_B32_1]], %bb.2 - ; GCN-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 4 ; GCN-NEXT: [[V_ADD_U:%[0-9]+]]:vreg_64 = V_ADD_U64_PSEUDO [[PHI3]], killed [[S_MOV_B64_]], implicit-def dead $vcc_lo, implicit $exec ; GCN-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 1 @@ -124,12 +124,12 @@ body: | %24:sreg_32 = S_MOV_B32 0 %25:sreg_32 = V_CMP_EQ_U32_e64 killed %23, killed %24, implicit $exec %26:vreg_1 = COPY %25 + 
SI_WAVE_RECONVERGE %22, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: successors: %bb.4(0x04000000), %bb.1(0x7c000000) %20:vreg_1 = PHI %26, %bb.2, %19, %bb.1 ;%20:vreg_1 = PHI %19, %bb.1, %26, %bb.2 - this is original phi created by SDAG - SI_END_CF %22, implicit-def dead $exec, implicit-def dead $scc, implicit $exec %27:sreg_64 = S_MOV_B64 4 %18:vreg_64 = V_ADD_U64_PSEUDO %17, killed %27, implicit-def dead $vcc, implicit $exec %28:sreg_32 = S_MOV_B32 1 diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-i1-copies.mir b/llvm/test/CodeGen/AMDGPU/si-lower-i1-copies.mir index 502116b121d949..0aa37723da41ba 100644 --- a/llvm/test/CodeGen/AMDGPU/si-lower-i1-copies.mir +++ b/llvm/test/CodeGen/AMDGPU/si-lower-i1-copies.mir @@ -24,11 +24,10 @@ body: | bb.2: %6:vreg_1 = PHI %5, %bb.1 - SI_END_CF %3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + SI_WAVE_RECONVERGE %11, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: %7:vreg_1 = PHI %6, %bb.2, %8, %bb.0 - SI_END_CF %11, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/si-opt-vgpr-liverange-bug-deadlanes.mir b/llvm/test/CodeGen/AMDGPU/si-opt-vgpr-liverange-bug-deadlanes.mir index f234ea24a9fe7a..df933174e0d5c9 100644 --- a/llvm/test/CodeGen/AMDGPU/si-opt-vgpr-liverange-bug-deadlanes.mir +++ b/llvm/test/CodeGen/AMDGPU/si-opt-vgpr-liverange-bug-deadlanes.mir @@ -55,10 +55,10 @@ body: | ; CHECK-NEXT: successors: %bb.4(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[PHI1]], %subreg.sub0, [[PHI1]], %subreg.sub1, [[PHI1]], %subreg.sub2, undef %6:vgpr_32, %subreg.sub3 + ; CHECK-NEXT: SI_WAVE_RECONVERGE killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: [[PHI2:%[0-9]+]]:vreg_128 = PHI [[PHI]], %bb.2, [[REG_SEQUENCE1]], %bb.3 - ; CHECK-NEXT: SI_END_CF killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: dead [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[PHI2]].sub2, %subreg.sub0, [[PHI2]].sub2, %subreg.sub1, [[PHI2]].sub2, %subreg.sub2, undef [[BUFFER_LOAD_DWORD_OFFEN]], %subreg.sub3 ; CHECK-NEXT: S_ENDPGM 0 bb.0: @@ -88,10 +88,10 @@ body: | successors: %bb.8(0x80000000) %12:vreg_128 = REG_SEQUENCE %3, %subreg.sub0, %3, %subreg.sub1, killed %3, %subreg.sub2, undef %7, %subreg.sub3 + SI_WAVE_RECONVERGE killed %11, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.8: %13:vreg_128 = PHI %10, %bb.6, %12, %bb.7 - SI_END_CF killed %11, implicit-def dead $exec, implicit-def dead $scc, implicit $exec %5:vreg_128 = REG_SEQUENCE %13.sub2, %subreg.sub0, %13.sub2, %subreg.sub1, killed %13.sub2, %subreg.sub2, undef %3, %subreg.sub3 S_ENDPGM 0 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll b/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll index d34769ad0fcf0a..5805ad4ea234db 100644 --- a/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll +++ b/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll @@ -16,30 +16,30 @@ define void @__omp_offloading_35_36570d3__ZN6openmc31process_advance_particle_ev ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7] -; GCN-NEXT: s_cbranch_execnz .LBB0_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB0_4 -; GCN-NEXT: .LBB0_2: ; %bb3 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB0_3: ; %bb2 +; GCN-NEXT: s_and_b64 s[6:7], s[4:5], exec +; GCN-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; GCN-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GCN-NEXT: s_cmov_b64 exec, s[6:7] +; GCN-NEXT: s_cbranch_scc0 .LBB0_2 +; GCN-NEXT: ; %bb.1: ; %bb2 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: v_mov_b32_e32 v4, v3 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: flat_store_dwordx2 v[1:2], v[3:4] ; GCN-NEXT: ; implicit-def: $vgpr1_vgpr2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: .LBB0_4: ; %bb1 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: .LBB0_2: ; %Flow +; GCN-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GCN-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-NEXT: s_cbranch_scc0 .LBB0_4 +; GCN-NEXT: ; %bb.3: ; %bb1 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: v_mov_b32_e32 v4, v3 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: flat_store_dwordx2 v[1:2], v[3:4] -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: .LBB0_4: ; 
%bb3 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] bb: diff --git a/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.mir b/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.mir index 3bdcc14936fb9b..5fb5ff642b7aaf 100644 --- a/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.mir +++ b/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.mir @@ -94,6 +94,7 @@ body: | %8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec %9:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %8, %subreg.sub1 FLAT_STORE_DWORDX2 %5, killed %9, 0, 0, implicit $exec, implicit $flat_scr + SI_WAVE_RECONVERGE %7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.4 bb.3: @@ -105,7 +106,6 @@ body: | S_BRANCH %bb.1 bb.4: - SI_END_CF %7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec DBG_VALUE_LIST !4, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_constu, 2712, DW_OP_mul, DW_OP_plus, DW_OP_plus_uconst, 2680, DW_OP_stack_value), %5, 0, debug-location !9 SI_RETURN ... diff --git a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll index 1d183210f95380..52fda17dee8075 100644 --- a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll +++ b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll @@ -8,53 +8,22 @@ declare i32 @llvm.amdgcn.workitem.id.x() define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { ; This used to bypass the structurization process because structurizer is unable to ; handle multiple-exits CFG. This should be correctly structurized. 
-; UNIFY-LABEL: define amdgpu_kernel void @kernel -; UNIFY-LABEL: entry: -; UNIFY: %tid = call i32 @llvm.amdgcn.workitem.id.x() -; UNIFY-NEXT: %cmp = icmp eq i32 %n.load, 256 -; UNIFY-NEXT: br i1 %cmp, label %if.then, label %if.else -; UNIFY-LABEL: if.then: -; UNIFY-NEXT: %cmp1 = icmp eq i32 %a.load, 0 -; UNIFY-NEXT: br i1 %cmp1, label %if.end6.sink.split, label %cond.false -; UNIFY-LABEL: cond.false: -; UNIFY-NEXT: call void @llvm.trap() -; UNIFY-NEXT: br label %UnifiedUnreachableBlock -; UNIFY-LABEL: if.else: -; UNIFY-NEXT: %cmp2 = icmp ult i32 %tid, 10 -; UNIFY-NEXT: br i1 %cmp2, label %if.then3, label %UnifiedReturnBlock -; UNIFY-LABEL: if.then3: -; UNIFY-NEXT: %cmp1.i7 = icmp eq i32 %a.load, 0 -; UNIFY-NEXT: br i1 %cmp1.i7, label %if.end6.sink.split, label %cond.false.i8 -; UNIFY-LABEL: cond.false.i8: -; UNIFY-NEXT: call void @llvm.trap() -; UNIFY-NEXT: br label %UnifiedUnreachableBlock -; UNIFY-LABEL: if.end6.sink.split: -; UNIFY-NEXT: %x.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %kernel.kernarg.segment, i64 8 -; UNIFY-NEXT: %x.load = load ptr addrspace(1), ptr addrspace(4) %x.kernarg.offset, align 8, !invariant.load !0 -; UNIFY-NEXT: %idxprom = sext i32 %tid to i64 -; UNIFY-NEXT: %x1 = getelementptr inbounds i32, ptr addrspace(1) %x.load, i64 %idxprom -; UNIFY-NEXT: store i32 %a.load, ptr addrspace(1) %x1, align 4 -; UNIFY-NEXT: br label %UnifiedReturnBlock -; UNIFY-LABEL: UnifiedUnreachableBlock: -; UNIFY-NEXT: call void @llvm.amdgcn.unreachable() -; UNIFY-NEXT: br label %UnifiedReturnBlock -; UNIFY-LABEL: UnifiedReturnBlock: -; UNIFY-NEXT: ret void - ; CHECK-LABEL: kernel: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dword s0, s[6:7], 0x10 ; CHECK-NEXT: s_load_dword s10, s[6:7], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmpk_lg_i32 s0, 0x100 -; CHECK-NEXT: s_cbranch_scc0 .LBB0_6 +; CHECK-NEXT: s_cbranch_scc0 .LBB0_5 ; CHECK-NEXT: ; %bb.1: ; %if.else ; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 10, v0 +; CHECK-NEXT: s_xor_b64 
s[8:9], vcc, exec ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-NEXT: s_mov_b64 s[0:1], 0 -; CHECK-NEXT: s_and_saveexec_b64 s[8:9], vcc -; CHECK-NEXT: s_cbranch_execz .LBB0_5 +; CHECK-NEXT: s_cmp_lg_u64 vcc, 0 +; CHECK-NEXT: s_cmov_b64 exec, vcc +; CHECK-NEXT: s_cbranch_scc0 .LBB0_6 ; CHECK-NEXT: ; %bb.2: ; %if.then3 ; CHECK-NEXT: s_cmp_lg_u32 s10, 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB0_14 @@ -63,27 +32,31 @@ define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { ; CHECK-NEXT: .LBB0_4: ; %Flow3 ; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec ; CHECK-NEXT: s_and_b64 s[2:3], s[2:3], exec -; CHECK-NEXT: .LBB0_5: ; %Flow2 ; CHECK-NEXT: s_or_b64 exec, exec, s[8:9] -; CHECK-NEXT: s_and_b64 vcc, exec, s[4:5] -; CHECK-NEXT: s_cbranch_vccz .LBB0_8 -; CHECK-NEXT: s_branch .LBB0_7 -; CHECK-NEXT: .LBB0_6: +; CHECK-NEXT: s_branch .LBB0_6 +; CHECK-NEXT: .LBB0_5: +; CHECK-NEXT: s_mov_b64 s[4:5], -1 ; CHECK-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-NEXT: s_mov_b64 s[0:1], 0 -; CHECK-NEXT: s_cbranch_execz .LBB0_8 -; CHECK-NEXT: .LBB0_7: ; %if.then +; CHECK-NEXT: .LBB0_6: ; %Flow +; CHECK-NEXT: s_and_b64 vcc, exec, s[4:5] +; CHECK-NEXT: s_cbranch_vccz .LBB0_8 +; CHECK-NEXT: ; %bb.7: ; %if.then ; CHECK-NEXT: s_cmp_lg_u32 s10, 0 ; CHECK-NEXT: s_mov_b64 s[0:1], -1 ; CHECK-NEXT: s_cbranch_scc1 .LBB0_13 ; CHECK-NEXT: .LBB0_8: ; %Flow4 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] -; CHECK-NEXT: .LBB0_9: ; %UnifiedUnreachableBlock +; CHECK-NEXT: s_mov_b64 s[4:5], exec +; CHECK-NEXT: s_and_b64 s[2:3], s[2:3], exec +; CHECK-NEXT: s_cmov_b64 exec, s[2:3] +; CHECK-NEXT: s_cbranch_scc0 .LBB0_10 +; CHECK-NEXT: ; %bb.9: ; %UnifiedUnreachableBlock ; CHECK-NEXT: ; divergent unreachable -; CHECK-NEXT: .LBB0_10: ; %Flow6 ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] -; CHECK-NEXT: s_cbranch_execz .LBB0_12 +; CHECK-NEXT: .LBB0_10: ; %Flow6 +; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CHECK-NEXT: s_cmov_b64 
exec, s[0:1] +; CHECK-NEXT: s_cbranch_scc0 .LBB0_12 ; CHECK-NEXT: ; %bb.11: ; %if.end6.sink.split ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -96,14 +69,13 @@ define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { ; CHECK-NEXT: s_mov_b64 s[0:1], 0 ; CHECK-NEXT: s_or_b64 s[2:3], s[2:3], exec ; CHECK-NEXT: s_trap 2 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] -; CHECK-NEXT: s_cbranch_execnz .LBB0_9 -; CHECK-NEXT: s_branch .LBB0_10 +; CHECK-NEXT: s_branch .LBB0_8 ; CHECK-NEXT: .LBB0_14: ; %cond.false.i8 ; CHECK-NEXT: s_mov_b64 s[2:3], -1 ; CHECK-NEXT: s_trap 2 ; CHECK-NEXT: s_branch .LBB0_4 + entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %cmp = icmp eq i32 %n, 256 @@ -137,3 +109,5 @@ if.end6.sink.split: if.end6: ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; UNIFY: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll b/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll index 1eef7b967f6d99..c3de2570779ed5 100644 --- a/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll +++ b/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll @@ -1,5 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s ; RUN: opt -mtriple=amdgcn-amd-amdhsa -lowerswitch -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow -verify -S %s -o - | FileCheck -check-prefix=IR %s @@ -47,6 +46,7 @@ define void @my_func(i32 %0) { ; IR-NEXT: br i1 [[TMP12]], label [[LEAFBLOCK5:%.*]], label [[FLOW13:%.*]] ; IR: LeafBlock5: ; IR-NEXT: [[SWITCHLEAF6:%.*]] = icmp eq i32 [[TMP0]], 2 +; IR-NEXT: call void @llvm.amdgcn.wave.reconverge.i64(i64 [[TMP13]]) ; IR-NEXT: br 
label [[FLOW13]] ; IR: Flow13: ; IR-NEXT: [[TMP14:%.*]] = phi i1 [ true, [[LEAFBLOCK5]] ], [ false, [[NODEBLOCK7]] ] @@ -58,11 +58,11 @@ define void @my_func(i32 %0) { ; IR: LeafBlock3: ; IR-NEXT: [[SWITCHLEAF4:%.*]] = icmp eq i32 [[TMP0]], 0 ; IR-NEXT: [[SWITCHLEAF4_INV:%.*]] = xor i1 [[SWITCHLEAF4]], true +; IR-NEXT: call void @llvm.amdgcn.wave.reconverge.i64(i64 [[TMP18]]) ; IR-NEXT: br label [[FLOW14]] ; IR: Flow14: ; IR-NEXT: [[TMP19:%.*]] = phi i1 [ [[SWITCHLEAF4_INV]], [[LEAFBLOCK3]] ], [ [[TMP14]], [[FLOW13]] ] ; IR-NEXT: [[TMP20:%.*]] = phi i1 [ [[SWITCHLEAF4]], [[LEAFBLOCK3]] ], [ [[TMP15]], [[FLOW13]] ] -; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP18]]) ; IR-NEXT: [[TMP21:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP20]]) ; IR-NEXT: [[TMP22:%.*]] = extractvalue { i1, i64 } [[TMP21]], 0 ; IR-NEXT: [[TMP23:%.*]] = extractvalue { i1, i64 } [[TMP21]], 1 @@ -72,7 +72,6 @@ define void @my_func(i32 %0) { ; IR: Flow15: ; IR-NEXT: [[TMP24]] = phi i1 [ [[TMP29:%.*]], [[FLOW16:%.*]] ], [ false, [[FLOW14]] ] ; IR-NEXT: [[TMP25]] = phi i1 [ [[TMP30:%.*]], [[FLOW16]] ], [ [[TMP19]], [[FLOW14]] ] -; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP23]]) ; IR-NEXT: br label [[FLOW12]] ; IR: LeafBlock9: ; IR-NEXT: [[SWITCHLEAF10:%.*]] = icmp sgt i32 [[TMP0]], 1 @@ -82,27 +81,28 @@ define void @my_func(i32 %0) { ; IR-NEXT: br i1 [[TMP27]], label [[DO_BODY_I_I_I_I:%.*]], label [[FLOW16]] ; IR: do.body.i.i.i.i: ; IR-NEXT: tail call fastcc void null() +; IR-NEXT: call void @llvm.amdgcn.wave.reconverge.i64(i64 [[TMP28]]) ; IR-NEXT: br label [[FLOW16]] ; IR: Flow16: ; IR-NEXT: [[TMP29]] = phi i1 [ true, [[DO_BODY_I_I_I_I]] ], [ false, [[LEAFBLOCK9]] ] ; IR-NEXT: [[TMP30]] = phi i1 [ false, [[DO_BODY_I_I_I_I]] ], [ true, [[LEAFBLOCK9]] ] -; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP28]]) +; IR-NEXT: call void @llvm.amdgcn.wave.reconverge.i64(i64 [[TMP23]]) ; IR-NEXT: br label [[FLOW15]] ; IR: do.body: ; IR-NEXT: tail call fastcc void null() +; 
IR-NEXT: call void @llvm.amdgcn.wave.reconverge.i64(i64 [[TMP8]]) ; IR-NEXT: br label [[FLOW17]] ; IR: Flow17: ; IR-NEXT: [[TMP31:%.*]] = phi i1 [ true, [[DO_BODY]] ], [ [[TMP4]], [[FLOW11]] ] -; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP8]]) ; IR-NEXT: [[TMP32:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP31]]) ; IR-NEXT: [[TMP33:%.*]] = extractvalue { i1, i64 } [[TMP32]], 0 ; IR-NEXT: [[TMP34:%.*]] = extractvalue { i1, i64 } [[TMP32]], 1 ; IR-NEXT: br i1 [[TMP33]], label [[UNIFIEDUNREACHABLEBLOCK:%.*]], label [[UNIFIEDRETURNBLOCK:%.*]] ; IR: UnifiedUnreachableBlock: ; IR-NEXT: call void @llvm.amdgcn.unreachable() +; IR-NEXT: call void @llvm.amdgcn.wave.reconverge.i64(i64 [[TMP34]]) ; IR-NEXT: br label [[UNIFIEDRETURNBLOCK]] ; IR: UnifiedReturnBlock: -; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP34]]) ; IR-NEXT: ret void ; ; GCN-LABEL: my_func: diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll index eebd32cd67e6e6..d5ceb164cc5208 100644 --- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll +++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll @@ -936,11 +936,12 @@ exit: define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 { ; SI-LABEL: test_kill_divergent_loop: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_mov_b64 s[0:1], exec ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[2:3] -; SI-NEXT: s_cbranch_execz .LBB10_4 +; SI-NEXT: s_xor_b64 s[4:5], vcc, exec +; SI-NEXT: s_mov_b64 s[0:1], exec +; SI-NEXT: s_and_b64 s[2:3], vcc, -1 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB10_5 ; SI-NEXT: ; %bb.1: ; %bb.preheader ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -961,7 +962,7 @@ define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 { ; SI-NEXT: ;;#ASMEND ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v7 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc -; SI-NEXT: s_cbranch_scc0 .LBB10_5 +; SI-NEXT: s_cbranch_scc0 
.LBB10_6 ; SI-NEXT: ; %bb.3: ; %bb ; SI-NEXT: ; in Loop: Header=BB10_2 Depth=1 ; SI-NEXT: s_andn2_b64 exec, exec, vcc @@ -969,15 +970,16 @@ define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; SI-NEXT: s_cbranch_vccnz .LBB10_2 -; SI-NEXT: .LBB10_4: ; %Flow1 +; SI-NEXT: ; %bb.4: ; %Flow ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: .LBB10_5: ; %exit ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 8 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm -; SI-NEXT: .LBB10_5: +; SI-NEXT: .LBB10_6: ; SI-NEXT: s_mov_b64 exec, 0 ; SI-NEXT: exp null off, off, off, off done vm ; SI-NEXT: s_endpgm @@ -986,9 +988,10 @@ define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 { ; GFX10-WAVE64: ; %bb.0: ; %entry ; GFX10-WAVE64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10-WAVE64-NEXT: s_mov_b64 s[0:1], exec -; GFX10-WAVE64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10-WAVE64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX10-WAVE64-NEXT: s_cbranch_execz .LBB10_3 +; GFX10-WAVE64-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX10-WAVE64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX10-WAVE64-NEXT: s_cmov_b64 exec, vcc +; GFX10-WAVE64-NEXT: s_cbranch_scc0 .LBB10_4 ; GFX10-WAVE64-NEXT: .LBB10_1: ; %bb ; GFX10-WAVE64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-WAVE64-NEXT: ;;#ASMSTART @@ -1006,7 +1009,7 @@ define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 { ; GFX10-WAVE64-NEXT: ;;#ASMEND ; GFX10-WAVE64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v7 ; GFX10-WAVE64-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc -; GFX10-WAVE64-NEXT: s_cbranch_scc0 .LBB10_4 +; GFX10-WAVE64-NEXT: s_cbranch_scc0 .LBB10_5 ; GFX10-WAVE64-NEXT: ; %bb.2: ; %bb ; GFX10-WAVE64-NEXT: ; in Loop: Header=BB10_1 Depth=1 ; GFX10-WAVE64-NEXT: s_andn2_b64 exec, exec, vcc @@ -1014,13 +1017,14 @@ define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 { ; GFX10-WAVE64-NEXT: 
s_waitcnt vmcnt(0) ; GFX10-WAVE64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10-WAVE64-NEXT: s_cbranch_vccnz .LBB10_1 -; GFX10-WAVE64-NEXT: .LBB10_3: ; %Flow1 +; GFX10-WAVE64-NEXT: ; %bb.3: ; %Flow ; GFX10-WAVE64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10-WAVE64-NEXT: .LBB10_4: ; %exit ; GFX10-WAVE64-NEXT: v_mov_b32_e32 v0, 8 ; GFX10-WAVE64-NEXT: global_store_dword v[0:1], v0, off ; GFX10-WAVE64-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WAVE64-NEXT: s_endpgm -; GFX10-WAVE64-NEXT: .LBB10_4: +; GFX10-WAVE64-NEXT: .LBB10_5: ; GFX10-WAVE64-NEXT: s_mov_b64 exec, 0 ; GFX10-WAVE64-NEXT: exp null off, off, off, off done vm ; GFX10-WAVE64-NEXT: s_endpgm @@ -1029,9 +1033,10 @@ define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 { ; GFX10-WAVE32: ; %bb.0: ; %entry ; GFX10-WAVE32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-WAVE32-NEXT: s_mov_b32 s0, exec_lo -; GFX10-WAVE32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10-WAVE32-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX10-WAVE32-NEXT: s_cbranch_execz .LBB10_3 +; GFX10-WAVE32-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX10-WAVE32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-WAVE32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB10_4 ; GFX10-WAVE32-NEXT: .LBB10_1: ; %bb ; GFX10-WAVE32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-WAVE32-NEXT: ;;#ASMSTART @@ -1049,7 +1054,7 @@ define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 { ; GFX10-WAVE32-NEXT: ;;#ASMEND ; GFX10-WAVE32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v7 ; GFX10-WAVE32-NEXT: s_andn2_b32 s0, s0, vcc_lo -; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB10_4 +; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB10_5 ; GFX10-WAVE32-NEXT: ; %bb.2: ; %bb ; GFX10-WAVE32-NEXT: ; in Loop: Header=BB10_1 Depth=1 ; GFX10-WAVE32-NEXT: s_andn2_b32 exec_lo, exec_lo, vcc_lo @@ -1057,24 +1062,26 @@ define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 { ; GFX10-WAVE32-NEXT: s_waitcnt vmcnt(0) ; GFX10-WAVE32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-WAVE32-NEXT: 
s_cbranch_vccnz .LBB10_1 -; GFX10-WAVE32-NEXT: .LBB10_3: ; %Flow1 +; GFX10-WAVE32-NEXT: ; %bb.3: ; %Flow ; GFX10-WAVE32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-WAVE32-NEXT: .LBB10_4: ; %exit ; GFX10-WAVE32-NEXT: v_mov_b32_e32 v0, 8 ; GFX10-WAVE32-NEXT: global_store_dword v[0:1], v0, off ; GFX10-WAVE32-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WAVE32-NEXT: s_endpgm -; GFX10-WAVE32-NEXT: .LBB10_4: +; GFX10-WAVE32-NEXT: .LBB10_5: ; GFX10-WAVE32-NEXT: s_mov_b32 exec_lo, 0 ; GFX10-WAVE32-NEXT: exp null off, off, off, off done vm ; GFX10-WAVE32-NEXT: s_endpgm ; ; GFX11-LABEL: test_kill_divergent_loop: ; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX11-NEXT: s_mov_b64 s[0:1], exec -; GFX11-NEXT: s_mov_b64 s[2:3], exec -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX11-NEXT: s_cbranch_execz .LBB10_3 +; GFX11-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX11-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX11-NEXT: s_cmov_b64 exec, vcc +; GFX11-NEXT: s_cbranch_scc0 .LBB10_4 ; GFX11-NEXT: .LBB10_1: ; %bb ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: ;;#ASMSTART @@ -1092,7 +1099,7 @@ define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 { ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v7 ; GFX11-NEXT: s_and_not1_b64 s[0:1], s[0:1], vcc -; GFX11-NEXT: s_cbranch_scc0 .LBB10_4 +; GFX11-NEXT: s_cbranch_scc0 .LBB10_5 ; GFX11-NEXT: ; %bb.2: ; %bb ; GFX11-NEXT: ; in Loop: Header=BB10_1 Depth=1 ; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc @@ -1100,15 +1107,16 @@ define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX11-NEXT: s_cbranch_vccnz .LBB10_1 -; GFX11-NEXT: .LBB10_3: ; %Flow1 +; GFX11-NEXT: ; %bb.3: ; %Flow ; GFX11-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11-NEXT: .LBB10_4: ; %exit ; GFX11-NEXT: v_mov_b32_e32 v0, 8 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 
0x0 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm -; GFX11-NEXT: .LBB10_4: +; GFX11-NEXT: .LBB10_5: ; GFX11-NEXT: s_mov_b64 exec, 0 ; GFX11-NEXT: exp mrt0 off, off, off, off done ; GFX11-NEXT: s_endpgm @@ -1402,22 +1410,24 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2, ; SI-NEXT: s_mov_b64 s[0:1], exec ; SI-NEXT: s_wqm_b64 exec, exec ; SI-NEXT: v_cmp_nle_f32_e32 vcc, 0, v1 -; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc -; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; SI-NEXT: s_cbranch_execz .LBB13_3 +; SI-NEXT: s_xor_b64 s[2:3], vcc, exec +; SI-NEXT: s_and_b64 s[4:5], vcc, -1 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB13_3 ; SI-NEXT: ; %bb.1: ; %bb3 ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc ; SI-NEXT: s_cbranch_scc0 .LBB13_6 ; SI-NEXT: ; %bb.2: ; %bb3 ; SI-NEXT: s_andn2_b64 exec, exec, vcc -; SI-NEXT: .LBB13_3: ; %bb4 ; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: .LBB13_3: ; %bb4 ; SI-NEXT: image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x10 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 -; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc -; SI-NEXT: s_cbranch_execz .LBB13_5 +; SI-NEXT: s_and_b64 s[0:1], vcc, -1 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB13_5 ; SI-NEXT: ; %bb.4: ; %bb8 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -1436,22 +1446,24 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2, ; GFX10-WAVE64-NEXT: s_mov_b64 s[0:1], exec ; GFX10-WAVE64-NEXT: s_wqm_b64 exec, exec ; GFX10-WAVE64-NEXT: v_cmp_nle_f32_e32 vcc, 0, v1 -; GFX10-WAVE64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10-WAVE64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX10-WAVE64-NEXT: s_cbranch_execz .LBB13_3 +; GFX10-WAVE64-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX10-WAVE64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX10-WAVE64-NEXT: s_cmov_b64 exec, vcc +; GFX10-WAVE64-NEXT: 
s_cbranch_scc0 .LBB13_3 ; GFX10-WAVE64-NEXT: ; %bb.1: ; %bb3 ; GFX10-WAVE64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 ; GFX10-WAVE64-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc ; GFX10-WAVE64-NEXT: s_cbranch_scc0 .LBB13_6 ; GFX10-WAVE64-NEXT: ; %bb.2: ; %bb3 ; GFX10-WAVE64-NEXT: s_andn2_b64 exec, exec, vcc -; GFX10-WAVE64-NEXT: .LBB13_3: ; %bb4 ; GFX10-WAVE64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10-WAVE64-NEXT: .LBB13_3: ; %bb4 ; GFX10-WAVE64-NEXT: image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x10 dim:SQ_RSRC_IMG_1D ; GFX10-WAVE64-NEXT: s_waitcnt vmcnt(0) ; GFX10-WAVE64-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 -; GFX10-WAVE64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX10-WAVE64-NEXT: s_cbranch_execz .LBB13_5 +; GFX10-WAVE64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX10-WAVE64-NEXT: s_cmov_b64 exec, vcc +; GFX10-WAVE64-NEXT: s_cbranch_scc0 .LBB13_5 ; GFX10-WAVE64-NEXT: ; %bb.4: ; %bb8 ; GFX10-WAVE64-NEXT: v_mov_b32_e32 v0, 9 ; GFX10-WAVE64-NEXT: global_store_dword v[0:1], v0, off @@ -1468,22 +1480,24 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2, ; GFX10-WAVE32-NEXT: s_mov_b32 s0, exec_lo ; GFX10-WAVE32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-WAVE32-NEXT: v_cmp_nle_f32_e32 vcc_lo, 0, v1 -; GFX10-WAVE32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10-WAVE32-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX10-WAVE32-NEXT: s_cbranch_execz .LBB13_3 +; GFX10-WAVE32-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX10-WAVE32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-WAVE32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX10-WAVE32-NEXT: ; %bb.1: ; %bb3 ; GFX10-WAVE32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v0 ; GFX10-WAVE32-NEXT: s_andn2_b32 s0, s0, vcc_lo ; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB13_6 ; GFX10-WAVE32-NEXT: ; %bb.2: ; %bb3 ; GFX10-WAVE32-NEXT: s_andn2_b32 exec_lo, exec_lo, vcc_lo -; GFX10-WAVE32-NEXT: .LBB13_3: ; %bb4 ; GFX10-WAVE32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-WAVE32-NEXT: .LBB13_3: ; %bb4 ; GFX10-WAVE32-NEXT: 
image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x10 dim:SQ_RSRC_IMG_1D ; GFX10-WAVE32-NEXT: s_waitcnt vmcnt(0) ; GFX10-WAVE32-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0, v0 -; GFX10-WAVE32-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX10-WAVE32-NEXT: s_cbranch_execz .LBB13_5 +; GFX10-WAVE32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-WAVE32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB13_5 ; GFX10-WAVE32-NEXT: ; %bb.4: ; %bb8 ; GFX10-WAVE32-NEXT: v_mov_b32_e32 v0, 9 ; GFX10-WAVE32-NEXT: global_store_dword v[0:1], v0, off @@ -1499,25 +1513,26 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2, ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_mov_b64 s[0:1], exec ; GFX11-NEXT: s_wqm_b64 exec, exec -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_mov_b64 s[2:3], exec -; GFX11-NEXT: v_cmpx_nle_f32_e32 0, v1 -; GFX11-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX11-NEXT: s_cbranch_execz .LBB13_3 +; GFX11-NEXT: v_cmp_nle_f32_e32 vcc, 0, v1 +; GFX11-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX11-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX11-NEXT: s_cmov_b64 exec, vcc +; GFX11-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX11-NEXT: ; %bb.1: ; %bb3 ; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 ; GFX11-NEXT: s_and_not1_b64 s[0:1], s[0:1], vcc ; GFX11-NEXT: s_cbranch_scc0 .LBB13_6 ; GFX11-NEXT: ; %bb.2: ; %bb3 ; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc -; GFX11-NEXT: .LBB13_3: ; %bb4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11-NEXT: .LBB13_3: ; %bb4 ; GFX11-NEXT: image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x10 dim:SQ_RSRC_IMG_1D -; GFX11-NEXT: s_mov_b64 s[0:1], exec ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmpx_neq_f32_e32 0, v0 -; GFX11-NEXT: s_cbranch_execz .LBB13_5 +; GFX11-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 +; GFX11-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX11-NEXT: s_cmov_b64 exec, vcc +; GFX11-NEXT: s_cbranch_scc0 .LBB13_5 ; GFX11-NEXT: ; %bb.4: ; %bb8 ; GFX11-NEXT: v_mov_b32_e32 v0, 9 ; 
GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc @@ -1554,31 +1569,35 @@ bb9: ; preds = %bb4 define amdgpu_ps void @cbranch_kill(i32 inreg %0, float %val0, float %val1) { ; SI-LABEL: cbranch_kill: ; SI: ; %bb.0: ; %.entry -; SI-NEXT: s_mov_b64 s[0:1], exec +; SI-NEXT: s_mov_b64 s[2:3], exec ; SI-NEXT: v_mov_b32_e32 v4, 0 ; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: image_sample_l v1, v[1:4], s[0:7], s[0:3] dmask:0x1 da ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ge_f32_e32 vcc, 0, v1 -; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc -; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; SI-NEXT: s_cbranch_execz .LBB14_3 +; SI-NEXT: s_xor_b64 s[0:1], vcc, exec +; SI-NEXT: s_and_b64 s[4:5], vcc, -1 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB14_3 ; SI-NEXT: ; %bb.1: ; %kill -; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], exec ; SI-NEXT: s_cbranch_scc0 .LBB14_6 ; SI-NEXT: ; %bb.2: ; %kill ; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: .LBB14_3: ; %Flow -; SI-NEXT: s_or_saveexec_b64 s[0:1], s[2:3] +; SI-NEXT: s_xor_b64 s[2:3], s[0:1], exec +; SI-NEXT: s_and_b64 s[4:5], s[0:1], -1 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: s_xor_b64 exec, exec, s[0:1] +; SI-NEXT: s_cmov_b64 exec, s[0:1] +; SI-NEXT: s_cbranch_scc0 .LBB14_5 ; SI-NEXT: ; %bb.4: ; %live ; SI-NEXT: v_mul_f32_e32 v2, v0, v1 -; SI-NEXT: ; %bb.5: ; %export -; SI-NEXT: s_or_b64 exec, exec, s[0:1] +; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: .LBB14_5: ; %export ; SI-NEXT: exp mrt0 v2, v2, v2, v2 done vm ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB14_6: @@ -1589,28 +1608,32 @@ define amdgpu_ps void @cbranch_kill(i32 inreg %0, float %val0, float %val1) { ; GFX10-WAVE64-LABEL: cbranch_kill: ; GFX10-WAVE64: ; %bb.0: ; %.entry ; GFX10-WAVE64-NEXT: v_mov_b32_e32 v2, 0 -; 
GFX10-WAVE64-NEXT: s_mov_b64 s[0:1], exec +; GFX10-WAVE64-NEXT: s_mov_b64 s[2:3], exec ; GFX10-WAVE64-NEXT: image_sample_l v1, [v1, v1, v1, v2], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-WAVE64-NEXT: s_waitcnt vmcnt(0) ; GFX10-WAVE64-NEXT: v_cmp_ge_f32_e32 vcc, 0, v1 -; GFX10-WAVE64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10-WAVE64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX10-WAVE64-NEXT: s_cbranch_execz .LBB14_3 +; GFX10-WAVE64-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX10-WAVE64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX10-WAVE64-NEXT: s_cmov_b64 exec, vcc +; GFX10-WAVE64-NEXT: s_cbranch_scc0 .LBB14_3 ; GFX10-WAVE64-NEXT: ; %bb.1: ; %kill -; GFX10-WAVE64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec -; GFX10-WAVE64-NEXT: ; implicit-def: $vgpr0 -; GFX10-WAVE64-NEXT: ; implicit-def: $vgpr1 +; GFX10-WAVE64-NEXT: s_andn2_b64 s[2:3], s[2:3], exec ; GFX10-WAVE64-NEXT: s_cbranch_scc0 .LBB14_6 ; GFX10-WAVE64-NEXT: ; %bb.2: ; %kill ; GFX10-WAVE64-NEXT: s_mov_b64 exec, 0 +; GFX10-WAVE64-NEXT: ; implicit-def: $vgpr0 +; GFX10-WAVE64-NEXT: ; implicit-def: $vgpr1 +; GFX10-WAVE64-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-WAVE64-NEXT: .LBB14_3: ; %Flow -; GFX10-WAVE64-NEXT: s_or_saveexec_b64 s[0:1], s[2:3] +; GFX10-WAVE64-NEXT: s_xor_b64 s[2:3], s[0:1], exec +; GFX10-WAVE64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX10-WAVE64-NEXT: ; implicit-def: $vgpr2 -; GFX10-WAVE64-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX10-WAVE64-NEXT: s_cmov_b64 exec, s[0:1] +; GFX10-WAVE64-NEXT: s_cbranch_scc0 .LBB14_5 ; GFX10-WAVE64-NEXT: ; %bb.4: ; %live ; GFX10-WAVE64-NEXT: v_mul_f32_e32 v2, v0, v1 -; GFX10-WAVE64-NEXT: ; %bb.5: ; %export -; GFX10-WAVE64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10-WAVE64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10-WAVE64-NEXT: .LBB14_5: ; %export ; GFX10-WAVE64-NEXT: exp mrt0 v2, v2, v2, v2 done vm ; GFX10-WAVE64-NEXT: s_endpgm ; GFX10-WAVE64-NEXT: .LBB14_6: @@ -1621,28 +1644,32 @@ define amdgpu_ps void @cbranch_kill(i32 inreg %0, float %val0, float %val1) { ; 
GFX10-WAVE32-LABEL: cbranch_kill: ; GFX10-WAVE32: ; %bb.0: ; %.entry ; GFX10-WAVE32-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-WAVE32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-WAVE32-NEXT: s_mov_b32 s1, exec_lo ; GFX10-WAVE32-NEXT: image_sample_l v1, [v1, v1, v1, v2], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-WAVE32-NEXT: s_waitcnt vmcnt(0) ; GFX10-WAVE32-NEXT: v_cmp_ge_f32_e32 vcc_lo, 0, v1 -; GFX10-WAVE32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10-WAVE32-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX10-WAVE32-NEXT: s_cbranch_execz .LBB14_3 +; GFX10-WAVE32-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX10-WAVE32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-WAVE32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB14_3 ; GFX10-WAVE32-NEXT: ; %bb.1: ; %kill -; GFX10-WAVE32-NEXT: s_andn2_b32 s0, s0, exec_lo -; GFX10-WAVE32-NEXT: ; implicit-def: $vgpr0 -; GFX10-WAVE32-NEXT: ; implicit-def: $vgpr1 +; GFX10-WAVE32-NEXT: s_andn2_b32 s1, s1, exec_lo ; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB14_6 ; GFX10-WAVE32-NEXT: ; %bb.2: ; %kill ; GFX10-WAVE32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-WAVE32-NEXT: ; implicit-def: $vgpr0 +; GFX10-WAVE32-NEXT: ; implicit-def: $vgpr1 +; GFX10-WAVE32-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX10-WAVE32-NEXT: .LBB14_3: ; %Flow -; GFX10-WAVE32-NEXT: s_or_saveexec_b32 s0, s1 +; GFX10-WAVE32-NEXT: s_xor_b32 s1, s0, exec_lo +; GFX10-WAVE32-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10-WAVE32-NEXT: ; implicit-def: $vgpr2 -; GFX10-WAVE32-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX10-WAVE32-NEXT: s_cmov_b32 exec_lo, s0 +; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB14_5 ; GFX10-WAVE32-NEXT: ; %bb.4: ; %live ; GFX10-WAVE32-NEXT: v_mul_f32_e32 v2, v0, v1 -; GFX10-WAVE32-NEXT: ; %bb.5: ; %export -; GFX10-WAVE32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-WAVE32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-WAVE32-NEXT: .LBB14_5: ; %export ; GFX10-WAVE32-NEXT: exp mrt0 v2, v2, v2, v2 done vm ; GFX10-WAVE32-NEXT: s_endpgm ; GFX10-WAVE32-NEXT: .LBB14_6: @@ -1653,29 
+1680,34 @@ define amdgpu_ps void @cbranch_kill(i32 inreg %0, float %val0, float %val1) { ; GFX11-LABEL: cbranch_kill: ; GFX11: ; %bb.0: ; %.entry ; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_mov_b64 s[0:1], exec -; GFX11-NEXT: image_sample_l v1, [v1, v1, v1, v2], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX11-NEXT: s_mov_b64 s[2:3], exec +; GFX11-NEXT: image_sample_l v1, [v1, v1, v1, v2], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmpx_ge_f32_e32 0, v1 -; GFX11-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX11-NEXT: s_cbranch_execz .LBB14_3 +; GFX11-NEXT: v_cmp_ge_f32_e32 vcc, 0, v1 +; GFX11-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX11-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX11-NEXT: s_cmov_b64 exec, vcc +; GFX11-NEXT: s_cbranch_scc0 .LBB14_3 ; GFX11-NEXT: ; %bb.1: ; %kill -; GFX11-NEXT: s_and_not1_b64 s[0:1], s[0:1], exec -; GFX11-NEXT: ; implicit-def: $vgpr0 -; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: s_and_not1_b64 s[2:3], s[2:3], exec ; GFX11-NEXT: s_cbranch_scc0 .LBB14_6 ; GFX11-NEXT: ; %bb.2: ; %kill ; GFX11-NEXT: s_mov_b64 exec, 0 +; GFX11-NEXT: ; implicit-def: $vgpr0 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: .LBB14_3: ; %Flow -; GFX11-NEXT: s_or_saveexec_b64 s[0:1], s[2:3] -; GFX11-NEXT: ; implicit-def: $vgpr2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX11-NEXT: s_xor_b64 s[2:3], s[0:1], exec +; GFX11-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: s_cmov_b64 exec, s[0:1] +; GFX11-NEXT: s_cbranch_scc0 .LBB14_5 ; GFX11-NEXT: ; %bb.4: ; %live ; GFX11-NEXT: v_mul_f32_e32 v2, v0, v1 -; GFX11-NEXT: ; %bb.5: ; %export -; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11-NEXT: .LBB14_5: ; %export ; GFX11-NEXT: exp mrt0 v2, v2, v2, v2 done ; GFX11-NEXT: s_endpgm ; 
GFX11-NEXT: .LBB14_6: @@ -1714,19 +1746,20 @@ define amdgpu_ps void @complex_loop(i32 inreg %cmpa, i32 %cmpb, i32 %cmpc) { ; SI-NEXT: s_branch .LBB15_3 ; SI-NEXT: .LBB15_2: ; %latch ; SI-NEXT: ; in Loop: Header=BB15_3 Depth=1 -; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_add_i32 s6, s6, 1 ; SI-NEXT: v_cmp_ge_i32_e32 vcc, s6, v1 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] ; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execz .LBB15_6 +; SI-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; SI-NEXT: s_cbranch_scc0 .LBB15_6 ; SI-NEXT: .LBB15_3: ; %hdr ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_cmp_gt_u32_e32 vcc, s6, v0 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB15_2 +; SI-NEXT: s_xor_b64 s[4:5], vcc, exec +; SI-NEXT: s_and_b64 s[8:9], vcc, -1 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB15_2 ; SI-NEXT: ; %bb.4: ; %kill ; SI-NEXT: ; in Loop: Header=BB15_3 Depth=1 ; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], exec @@ -1734,9 +1767,9 @@ define amdgpu_ps void @complex_loop(i32 inreg %cmpa, i32 %cmpb, i32 %cmpc) { ; SI-NEXT: ; %bb.5: ; %kill ; SI-NEXT: ; in Loop: Header=BB15_3 Depth=1 ; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_branch .LBB15_2 -; SI-NEXT: .LBB15_6: ; %Flow -; SI-NEXT: s_or_b64 exec, exec, s[0:1] +; SI-NEXT: .LBB15_6: ; %._crit_edge ; SI-NEXT: exp mrt0 v2, v2, v0, v0 done vm ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB15_7: @@ -1759,19 +1792,20 @@ define amdgpu_ps void @complex_loop(i32 inreg %cmpa, i32 %cmpb, i32 %cmpc) { ; GFX10-WAVE64-NEXT: s_branch .LBB15_3 ; GFX10-WAVE64-NEXT: .LBB15_2: ; %latch ; GFX10-WAVE64-NEXT: ; in Loop: Header=BB15_3 Depth=1 -; GFX10-WAVE64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX10-WAVE64-NEXT: s_add_i32 s6, s6, 1 ; GFX10-WAVE64-NEXT: v_cmp_ge_i32_e32 vcc, s6, v1 ; GFX10-WAVE64-NEXT: v_mov_b32_e32 v2, s6 ; 
GFX10-WAVE64-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-WAVE64-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-WAVE64-NEXT: s_cbranch_execz .LBB15_6 +; GFX10-WAVE64-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX10-WAVE64-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX10-WAVE64-NEXT: s_cbranch_scc0 .LBB15_6 ; GFX10-WAVE64-NEXT: .LBB15_3: ; %hdr ; GFX10-WAVE64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-WAVE64-NEXT: v_cmp_gt_u32_e32 vcc, s6, v0 -; GFX10-WAVE64-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX10-WAVE64-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX10-WAVE64-NEXT: s_cbranch_execz .LBB15_2 +; GFX10-WAVE64-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX10-WAVE64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX10-WAVE64-NEXT: s_cmov_b64 exec, vcc +; GFX10-WAVE64-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX10-WAVE64-NEXT: ; %bb.4: ; %kill ; GFX10-WAVE64-NEXT: ; in Loop: Header=BB15_3 Depth=1 ; GFX10-WAVE64-NEXT: s_andn2_b64 s[2:3], s[2:3], exec @@ -1779,9 +1813,9 @@ define amdgpu_ps void @complex_loop(i32 inreg %cmpa, i32 %cmpb, i32 %cmpc) { ; GFX10-WAVE64-NEXT: ; %bb.5: ; %kill ; GFX10-WAVE64-NEXT: ; in Loop: Header=BB15_3 Depth=1 ; GFX10-WAVE64-NEXT: s_mov_b64 exec, 0 +; GFX10-WAVE64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX10-WAVE64-NEXT: s_branch .LBB15_2 -; GFX10-WAVE64-NEXT: .LBB15_6: ; %Flow -; GFX10-WAVE64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10-WAVE64-NEXT: .LBB15_6: ; %._crit_edge ; GFX10-WAVE64-NEXT: exp mrt0 v2, v2, v0, v0 done vm ; GFX10-WAVE64-NEXT: s_endpgm ; GFX10-WAVE64-NEXT: .LBB15_7: @@ -1804,19 +1838,20 @@ define amdgpu_ps void @complex_loop(i32 inreg %cmpa, i32 %cmpb, i32 %cmpc) { ; GFX10-WAVE32-NEXT: s_branch .LBB15_3 ; GFX10-WAVE32-NEXT: .LBB15_2: ; %latch ; GFX10-WAVE32-NEXT: ; in Loop: Header=BB15_3 Depth=1 -; GFX10-WAVE32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10-WAVE32-NEXT: s_add_i32 s2, s2, 1 ; GFX10-WAVE32-NEXT: v_cmp_ge_i32_e32 vcc_lo, s2, v1 ; GFX10-WAVE32-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WAVE32-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX10-WAVE32-NEXT: 
s_andn2_b32 exec_lo, exec_lo, s0 -; GFX10-WAVE32-NEXT: s_cbranch_execz .LBB15_6 +; GFX10-WAVE32-NEXT: s_andn2_b32 s3, exec_lo, s0 +; GFX10-WAVE32-NEXT: s_cselect_b32 exec_lo, s3, s0 +; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB15_6 ; GFX10-WAVE32-NEXT: .LBB15_3: ; %hdr ; GFX10-WAVE32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-WAVE32-NEXT: v_cmp_gt_u32_e32 vcc_lo, s2, v0 -; GFX10-WAVE32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX10-WAVE32-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX10-WAVE32-NEXT: s_cbranch_execz .LBB15_2 +; GFX10-WAVE32-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX10-WAVE32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-WAVE32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX10-WAVE32-NEXT: ; %bb.4: ; %kill ; GFX10-WAVE32-NEXT: ; in Loop: Header=BB15_3 Depth=1 ; GFX10-WAVE32-NEXT: s_andn2_b32 s1, s1, exec_lo @@ -1824,9 +1859,9 @@ define amdgpu_ps void @complex_loop(i32 inreg %cmpa, i32 %cmpb, i32 %cmpc) { ; GFX10-WAVE32-NEXT: ; %bb.5: ; %kill ; GFX10-WAVE32-NEXT: ; in Loop: Header=BB15_3 Depth=1 ; GFX10-WAVE32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-WAVE32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10-WAVE32-NEXT: s_branch .LBB15_2 -; GFX10-WAVE32-NEXT: .LBB15_6: ; %Flow -; GFX10-WAVE32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-WAVE32-NEXT: .LBB15_6: ; %._crit_edge ; GFX10-WAVE32-NEXT: exp mrt0 v2, v2, v0, v0 done vm ; GFX10-WAVE32-NEXT: s_endpgm ; GFX10-WAVE32-NEXT: .LBB15_7: @@ -1849,20 +1884,22 @@ define amdgpu_ps void @complex_loop(i32 inreg %cmpa, i32 %cmpb, i32 %cmpc) { ; GFX11-NEXT: s_branch .LBB15_3 ; GFX11-NEXT: .LBB15_2: ; %latch ; GFX11-NEXT: ; in Loop: Header=BB15_3 Depth=1 -; GFX11-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX11-NEXT: s_add_i32 s6, s6, 1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_cmp_ge_i32_e32 vcc, s6, v1 ; GFX11-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: 
s_cbranch_execz .LBB15_6 +; GFX11-NEXT: s_and_not1_b64 s[4:5], exec, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX11-NEXT: s_cbranch_scc0 .LBB15_6 ; GFX11-NEXT: .LBB15_3: ; %hdr ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_mov_b64 s[4:5], exec -; GFX11-NEXT: v_cmpx_gt_u32_e64 s6, v0 -; GFX11-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX11-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc, s6, v0 +; GFX11-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX11-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX11-NEXT: s_cmov_b64 exec, vcc +; GFX11-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX11-NEXT: ; %bb.4: ; %kill ; GFX11-NEXT: ; in Loop: Header=BB15_3 Depth=1 ; GFX11-NEXT: s_and_not1_b64 s[2:3], s[2:3], exec @@ -1870,9 +1907,10 @@ define amdgpu_ps void @complex_loop(i32 inreg %cmpa, i32 %cmpb, i32 %cmpc) { ; GFX11-NEXT: ; %bb.5: ; %kill ; GFX11-NEXT: ; in Loop: Header=BB15_3 Depth=1 ; GFX11-NEXT: s_mov_b64 exec, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX11-NEXT: s_branch .LBB15_2 -; GFX11-NEXT: .LBB15_6: ; %Flow -; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11-NEXT: .LBB15_6: ; %._crit_edge ; GFX11-NEXT: exp mrt0 v2, v2, v0, v0 done ; GFX11-NEXT: s_endpgm ; GFX11-NEXT: .LBB15_7: @@ -1912,40 +1950,60 @@ latch: } define void @skip_mode_switch(i32 %arg) { -; WAVE64-LABEL: skip_mode_switch: -; WAVE64: ; %bb.0: ; %entry -; WAVE64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; WAVE64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; WAVE64-NEXT: s_and_saveexec_b64 s[4:5], vcc -; WAVE64-NEXT: s_cbranch_execz .LBB16_2 -; WAVE64-NEXT: ; %bb.1: ; %bb.0 -; WAVE64-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 2), 3 -; WAVE64-NEXT: .LBB16_2: ; %bb.1 -; WAVE64-NEXT: s_or_b64 exec, exec, s[4:5] -; WAVE64-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: skip_mode_switch: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_eq_u32_e32 
vcc, 0, v0 +; SI-NEXT: s_mov_b64 s[4:5], exec +; SI-NEXT: s_and_b64 s[6:7], vcc, -1 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB16_2 +; SI-NEXT: ; %bb.1: ; %bb.0 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 2), 3 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: .LBB16_2: ; %bb.1 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-WAVE64-LABEL: skip_mode_switch: +; GFX10-WAVE64: ; %bb.0: ; %entry +; GFX10-WAVE64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-WAVE64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX10-WAVE64-NEXT: s_mov_b64 s[4:5], exec +; GFX10-WAVE64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX10-WAVE64-NEXT: s_cmov_b64 exec, vcc +; GFX10-WAVE64-NEXT: s_cbranch_scc0 .LBB16_2 +; GFX10-WAVE64-NEXT: ; %bb.1: ; %bb.0 +; GFX10-WAVE64-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 2), 3 +; GFX10-WAVE64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10-WAVE64-NEXT: .LBB16_2: ; %bb.1 +; GFX10-WAVE64-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-WAVE32-LABEL: skip_mode_switch: ; GFX10-WAVE32: ; %bb.0: ; %entry ; GFX10-WAVE32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-WAVE32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-WAVE32-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX10-WAVE32-NEXT: s_cbranch_execz .LBB16_2 +; GFX10-WAVE32-NEXT: s_mov_b32 s4, exec_lo +; GFX10-WAVE32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-WAVE32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB16_2 ; GFX10-WAVE32-NEXT: ; %bb.1: ; %bb.0 ; GFX10-WAVE32-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 2), 3 -; GFX10-WAVE32-NEXT: .LBB16_2: ; %bb.1 ; GFX10-WAVE32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-WAVE32-NEXT: .LBB16_2: ; %bb.1 ; GFX10-WAVE32-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: skip_mode_switch: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX11-NEXT: s_mov_b64 s[0:1], exec -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-NEXT: 
s_cmp_lg_u64 vcc, 0 +; GFX11-NEXT: s_cmov_b64 exec, vcc +; GFX11-NEXT: s_cbranch_scc0 .LBB16_2 ; GFX11-NEXT: ; %bb.1: ; %bb.0 ; GFX11-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 2), 3 -; GFX11-NEXT: .LBB16_2: ; %bb.1 ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11-NEXT: .LBB16_2: ; %bb.1 ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp = icmp eq i32 %arg, 0 diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll index b9ad4615fcbcf1..bf0ba74e6e29e9 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -10069,36 +10069,23 @@ entry: define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: test_limited_sgpr: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX6-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GFX6-NEXT: s_mov_b32 s42, -1 -; GFX6-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX6-NEXT: s_add_u32 s40, s40, s9 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 -; GFX6-NEXT: s_addc_u32 s41, s41, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, -1, v0 -; GFX6-NEXT: v_mov_b32_e32 v6, 0 -; GFX6-NEXT: s_mov_b64 s[4:5], exec ; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_mov_b32_e32 v6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b64 exec, 15 -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 -; GFX6-NEXT: s_waitcnt expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_writelane_b32 v1, s0, 0 -; GFX6-NEXT: v_writelane_b32 v1, s1, 1 -; GFX6-NEXT: v_writelane_b32 v1, s2, 2 -; GFX6-NEXT: v_writelane_b32 v1, s3, 3 -; GFX6-NEXT: s_mov_b32 s8, 0x80400 -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s8 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[4:5] +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b64 
s[4:5], s[2:3] ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 8, v0 ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:240 -; GFX6-NEXT: s_mov_b32 s2, 0x84400 +; GFX6-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX6-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX6-NEXT: s_mov_b32 s42, -1 +; GFX6-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX6-NEXT: s_add_u32 s40, s40, s9 +; GFX6-NEXT: s_addc_u32 s41, s41, 0 +; GFX6-NEXT: s_mov_b32 s2, 0x83400 ; GFX6-NEXT: s_mov_b64 s[8:9], exec ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10108,7 +10095,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:224 -; GFX6-NEXT: s_mov_b32 s2, 0x84000 +; GFX6-NEXT: s_mov_b32 s2, 0x83000 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10117,7 +10104,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:208 -; GFX6-NEXT: s_mov_b32 s2, 0x83c00 +; GFX6-NEXT: s_mov_b32 s2, 0x82c00 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10126,7 +10113,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:192 -; GFX6-NEXT: s_mov_b32 s2, 0x83800 +; GFX6-NEXT: s_mov_b32 s2, 0x82800 ; 
GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10135,7 +10122,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:176 -; GFX6-NEXT: s_mov_b32 s2, 0x83400 +; GFX6-NEXT: s_mov_b32 s2, 0x82400 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10144,7 +10131,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:160 -; GFX6-NEXT: s_mov_b32 s2, 0x83000 +; GFX6-NEXT: s_mov_b32 s2, 0x82000 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10153,7 +10140,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:144 -; GFX6-NEXT: s_mov_b32 s2, 0x82c00 +; GFX6-NEXT: s_mov_b32 s2, 0x81c00 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10162,7 +10149,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:128 -; GFX6-NEXT: 
s_mov_b32 s2, 0x82800 +; GFX6-NEXT: s_mov_b32 s2, 0x81800 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10171,7 +10158,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:112 -; GFX6-NEXT: s_mov_b32 s2, 0x82400 +; GFX6-NEXT: s_mov_b32 s2, 0x81400 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10180,7 +10167,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:96 -; GFX6-NEXT: s_mov_b32 s2, 0x82000 +; GFX6-NEXT: s_mov_b32 s2, 0x81000 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10189,41 +10176,25 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:80 -; GFX6-NEXT: s_mov_b32 s2, 0x81c00 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; 
GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:64 -; GFX6-NEXT: s_mov_b32 s2, 0x81400 +; GFX6-NEXT: s_mov_b32 s2, 0x80c00 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_load_dwordx4 v[16:19], v[5:6], s[4:7], 0 addr64 offset:64 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 ; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:16 -; GFX6-NEXT: s_mov_b32 s2, 0x80c00 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:32 -; GFX6-NEXT: s_mov_b32 s2, 0x81000 +; GFX6-NEXT: s_mov_b32 s2, 0x80800 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_load_dwordx4 v[12:15], v[5:6], s[4:7], 0 addr64 offset:32 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_mov_b64 exec, 15 ; GFX6-NEXT: buffer_store_dword 
v4, off, s[40:43], 0 @@ -10232,25 +10203,17 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_writelane_b32 v4, s1, 1 ; GFX6-NEXT: v_writelane_b32 v4, s2, 2 ; GFX6-NEXT: v_writelane_b32 v4, s3, 3 -; GFX6-NEXT: s_mov_b32 s10, 0x80800 +; GFX6-NEXT: s_mov_b32 s10, 0x80400 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s10 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[8:9] -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:48 -; GFX6-NEXT: s_mov_b32 s0, 0x81800 +; GFX6-NEXT: buffer_load_dwordx4 v[20:23], v[5:6], s[4:7], 0 addr64 offset:48 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 13, v0 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 16, v4 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s0 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(3) ; GFX6-NEXT: v_mov_b32_e32 v7, 1 -; GFX6-NEXT: s_mov_b64 s[0:1], exec +; GFX6-NEXT: s_mov_b64 s[2:3], exec ; GFX6-NEXT: buffer_store_dword v7, v4, s[40:43], 0 offen ; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ; def s[4:11] @@ -10266,13 +10229,15 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_writelane_b32 v4, s9, 5 ; GFX6-NEXT: v_writelane_b32 v4, s10, 6 ; GFX6-NEXT: v_writelane_b32 v4, s11, 7 -; GFX6-NEXT: s_mov_b32 s2, 0x84800 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: s_mov_b32 s12, 0x83800 +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword 
v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[0:1] +; GFX6-NEXT: s_mov_b64 exec, s[2:3] ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: s_mov_b64 s[2:3], exec +; GFX6-NEXT: s_and_b64 s[36:37], vcc, -1 ; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ; def s[8:15] ; GFX6-NEXT: ;;#ASMEND @@ -10283,19 +10248,18 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: ; def s[24:31] ; GFX6-NEXT: ;;#ASMEND ; GFX6-NEXT: ;;#ASMSTART -; GFX6-NEXT: ; def s[0:3] +; GFX6-NEXT: ; def s[4:7] ; GFX6-NEXT: ;;#ASMEND ; GFX6-NEXT: ;;#ASMSTART -; GFX6-NEXT: ; def s[4:5] +; GFX6-NEXT: ; def s[34:35] ; GFX6-NEXT: ;;#ASMEND ; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ; def s33 ; GFX6-NEXT: ;;#ASMEND -; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX6-NEXT: s_mov_b64 vcc, s[6:7] -; GFX6-NEXT: s_cbranch_execz .LBB1_2 +; GFX6-NEXT: s_cmov_b64 exec, vcc +; GFX6-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX6-NEXT: ; %bb.1: ; %bb0 -; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: s_mov_b64 s[36:37], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10307,18 +10271,18 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_writelane_b32 v4, s13, 5 ; GFX6-NEXT: v_writelane_b32 v4, s14, 6 ; GFX6-NEXT: v_writelane_b32 v4, s15, 7 -; GFX6-NEXT: s_mov_b32 s34, 0x85000 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill +; GFX6-NEXT: s_mov_b32 s38, 0x84000 +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: s_mov_b64 exec, s[36:37] +; GFX6-NEXT: s_mov_b64 s[36:37], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 -; GFX6-NEXT: 
s_mov_b32 s34, 0x84800 +; GFX6-NEXT: s_mov_b32 s38, 0x83800 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_readlane_b32 s8, v4, 0 ; GFX6-NEXT: v_readlane_b32 s9, v4, 1 @@ -10330,8 +10294,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_readlane_b32 s15, v4, 7 ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: s_mov_b64 exec, s[36:37] +; GFX6-NEXT: s_mov_b64 s[36:37], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10343,18 +10307,18 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_writelane_b32 v4, s21, 5 ; GFX6-NEXT: v_writelane_b32 v4, s22, 6 ; GFX6-NEXT: v_writelane_b32 v4, s23, 7 -; GFX6-NEXT: s_mov_b32 s34, 0x85800 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill +; GFX6-NEXT: s_mov_b32 s38, 0x84800 +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: s_mov_b64 exec, s[36:37] +; GFX6-NEXT: s_mov_b64 s[36:37], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_mov_b32 s34, 0x85000 +; GFX6-NEXT: s_mov_b32 s38, 0x84000 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_readlane_b32 s16, 
v4, 0 ; GFX6-NEXT: v_readlane_b32 s17, v4, 1 @@ -10366,8 +10330,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_readlane_b32 s23, v4, 7 ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: s_mov_b64 exec, s[36:37] +; GFX6-NEXT: s_mov_b64 s[36:37], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10379,18 +10343,18 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_writelane_b32 v4, s29, 5 ; GFX6-NEXT: v_writelane_b32 v4, s30, 6 ; GFX6-NEXT: v_writelane_b32 v4, s31, 7 -; GFX6-NEXT: s_mov_b32 s34, 0x86000 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill +; GFX6-NEXT: s_mov_b32 s38, 0x85000 +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: s_mov_b64 exec, s[36:37] +; GFX6-NEXT: s_mov_b64 s[36:37], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_mov_b32 s34, 0x85800 +; GFX6-NEXT: s_mov_b32 s38, 0x84800 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_readlane_b32 s24, v4, 0 ; GFX6-NEXT: v_readlane_b32 s25, v4, 1 @@ -10402,8 +10366,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_readlane_b32 s31, v4, 7 ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; 
GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: s_mov_b64 exec, s[36:37] +; GFX6-NEXT: s_mov_b64 s[36:37], exec ; GFX6-NEXT: s_mov_b64 exec, 15 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10411,30 +10375,33 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_writelane_b32 v4, s1, 1 ; GFX6-NEXT: v_writelane_b32 v4, s2, 2 ; GFX6-NEXT: v_writelane_b32 v4, s3, 3 -; GFX6-NEXT: s_mov_b32 s34, 0x86800 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill +; GFX6-NEXT: s_mov_b32 s38, 0x85c00 +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[6:7] +; GFX6-NEXT: s_mov_b64 exec, s[36:37] ; GFX6-NEXT: s_mov_b64 s[0:1], exec -; GFX6-NEXT: s_mov_b64 exec, 3 +; GFX6-NEXT: s_mov_b64 vcc, s[2:3] +; GFX6-NEXT: s_mov_b64 exec, 15 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_writelane_b32 v4, s4, 0 ; GFX6-NEXT: v_writelane_b32 v4, s5, 1 -; GFX6-NEXT: s_mov_b32 s2, 0x86c00 +; GFX6-NEXT: v_writelane_b32 v4, s6, 2 +; GFX6-NEXT: v_writelane_b32 v4, s7, 3 +; GFX6-NEXT: s_mov_b32 s2, 0x85800 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[0:1] -; GFX6-NEXT: s_mov_b64 s[34:35], exec +; GFX6-NEXT: s_mov_b64 s[36:37], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_mov_b32 s36, 0x86000 +; GFX6-NEXT: s_mov_b32 s38, 0x85000 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s36 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload ; GFX6-NEXT: 
s_waitcnt vmcnt(0) ; GFX6-NEXT: v_readlane_b32 s0, v4, 0 ; GFX6-NEXT: v_readlane_b32 s1, v4, 1 @@ -10446,13 +10413,13 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_readlane_b32 s7, v4, 7 ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[34:35] -; GFX6-NEXT: s_mov_b64 s[34:35], exec +; GFX6-NEXT: s_mov_b64 exec, s[36:37] +; GFX6-NEXT: s_mov_b64 s[44:45], exec ; GFX6-NEXT: s_mov_b64 exec, 15 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_mov_b32 s44, 0x86800 +; GFX6-NEXT: v_mov_b32_e32 v7, 0x2160 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s44 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_readlane_b32 s36, v4, 0 ; GFX6-NEXT: v_readlane_b32 s37, v4, 1 @@ -10460,24 +10427,68 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_readlane_b32 s39, v4, 3 ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[34:35] -; GFX6-NEXT: s_mov_b64 s[44:45], exec -; GFX6-NEXT: s_mov_b64 exec, 3 +; GFX6-NEXT: s_mov_b64 exec, s[44:45] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ; use s[8:15],s[16:23],s[24:31],s[0:7],s[36:39],s[34:35] +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: s_mov_b64 exec, 15 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 -; GFX6-NEXT: v_mov_b32_e32 v7, 0x21b0 +; GFX6-NEXT: s_mov_b32 s6, 0x85c00 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s6 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readlane_b32 s34, v4, 0 -; GFX6-NEXT: v_readlane_b32 s35, v4, 1 +; GFX6-NEXT: v_readlane_b32 s0, v4, 0 +; GFX6-NEXT: 
v_readlane_b32 s1, v4, 1 +; GFX6-NEXT: v_readlane_b32 s2, v4, 2 +; GFX6-NEXT: v_readlane_b32 s3, v4, 3 ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[44:45] -; GFX6-NEXT: ;;#ASMSTART -; GFX6-NEXT: ; use s[8:15],s[16:23],s[24:31],s[0:7],s[36:39],s[34:35] -; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_mov_b64 exec, s[4:5] +; GFX6-NEXT: s_mov_b32 s2, 0x83800 +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: s_mov_b32 s2, 0x84000 +; GFX6-NEXT: buffer_store_dword v12, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v13, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v14, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v15, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: s_mov_b32 s2, 0x84800 +; GFX6-NEXT: buffer_store_dword v16, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v17, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v18, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v19, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: v_mov_b32_e32 v0, v20 +; GFX6-NEXT: v_mov_b32_e32 v1, v21 +; GFX6-NEXT: v_mov_b32_e32 v2, v22 +; GFX6-NEXT: v_mov_b32_e32 v3, v23 +; GFX6-NEXT: s_waitcnt expcnt(3) ; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dword v16, off, s[40:43], s2 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v17, off, s[40:43], s2 
offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v18, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v19, off, s[40:43], s2 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s2, 0x84000 +; GFX6-NEXT: v_mov_b32_e32 v23, v3 +; GFX6-NEXT: buffer_load_dword v12, off, s[40:43], s2 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v13, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v14, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v15, off, s[40:43], s2 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s2, 0x83800 +; GFX6-NEXT: v_mov_b32_e32 v22, v2 +; GFX6-NEXT: v_mov_b32_e32 v21, v1 +; GFX6-NEXT: v_mov_b32_e32 v20, v0 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s2 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ;;#ASMEND ; GFX6-NEXT: ;;#ASMSTART @@ -10490,158 +10501,122 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: ;;#ASMEND ; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ;;#ASMEND -; GFX6-NEXT: .LBB1_2: ; %ret ; GFX6-NEXT: s_or_b64 exec, exec, vcc -; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: .LBB1_2: ; %ret +; GFX6-NEXT: s_mov_b64 s[2:3], exec ; GFX6-NEXT: s_mov_b64 exec, 15 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_mov_b32 s6, 0x80400 +; GFX6-NEXT: s_mov_b32 s8, 0x80400 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s6 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s8 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readlane_b32 s0, v4, 0 -; GFX6-NEXT: v_readlane_b32 s1, v4, 1 -; GFX6-NEXT: 
v_readlane_b32 s2, v4, 2 -; GFX6-NEXT: v_readlane_b32 s3, v4, 3 +; GFX6-NEXT: v_readlane_b32 s4, v4, 0 +; GFX6-NEXT: v_readlane_b32 s5, v4, 1 +; GFX6-NEXT: v_readlane_b32 s6, v4, 2 +; GFX6-NEXT: v_readlane_b32 s7, v4, 3 ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[4:5] -; GFX6-NEXT: s_mov_b64 s[36:37], s[0:1] -; GFX6-NEXT: s_mov_b64 s[4:5], exec -; GFX6-NEXT: s_mov_b64 exec, 15 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_mov_b32 s6, 0x80800 +; GFX6-NEXT: s_mov_b64 exec, s[2:3] +; GFX6-NEXT: s_mov_b32 s4, 0x83400 +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] +; GFX6-NEXT: s_mov_b32 s4, 0x83000 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:240 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s6 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s4, 0x82c00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readlane_b32 s0, v4, 0 -; GFX6-NEXT: v_readlane_b32 s1, v4, 1 -; GFX6-NEXT: v_readlane_b32 s2, v4, 2 -; GFX6-NEXT: v_readlane_b32 s3, v4, 3 -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:224 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: 
buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s4, 0x82800 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[4:5] -; GFX6-NEXT: s_mov_b32 s0, 0x84400 -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX6-NEXT: s_mov_b32 s0, 0x84000 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:240 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s0, 0x83c00 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:224 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s0, 0x83800 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], 
v[5:6], s[36:39], 0 addr64 offset:208 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s0, 0x83400 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:192 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s0, 0x83000 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:176 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s0, 0x82c00 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:160 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 
s0, 0x82800 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:144 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s0, 0x82400 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:128 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s0, 0x82000 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:112 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s0, 0x81c00 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:96 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: 
buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s0, 0x81400 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:80 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s0, 0x81800 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:64 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s0, 0x81000 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:208 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s4, 0x82400 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:192 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], 
s4 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s4, 0x82000 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:176 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s4, 0x81c00 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:160 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s4, 0x81800 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:144 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s4, 0x81400 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:128 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 
offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s4, 0x81000 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:48 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:112 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s0, 0x80c00 +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s4, 0x80c00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:32 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:96 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 
offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s4, 0x80800 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:16 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:80 +; GFX6-NEXT: buffer_store_dwordx4 v[16:19], v[5:6], s[0:3], 0 addr64 offset:64 +; GFX6-NEXT: buffer_store_dwordx4 v[20:23], v[5:6], s[0:3], 0 addr64 offset:48 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], v[5:6], s[0:3], 0 addr64 offset:32 +; GFX6-NEXT: s_waitcnt expcnt(3) +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:16 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm ; ; GFX9-FLATSCR-LABEL: test_limited_sgpr: @@ -10656,6 +10631,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2050 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 16 +; GFX9-FLATSCR-NEXT: s_mov_b64 s[34:35], exec ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:224 @@ -10686,8 +10662,9 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:96 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20b0 ; 
GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(2) -; GFX9-FLATSCR-NEXT: v_lshl_add_u32 v4, v7, 13, v4 ; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX9-FLATSCR-NEXT: v_lshl_add_u32 v4, v7, 13, v4 +; GFX9-FLATSCR-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-FLATSCR-NEXT: scratch_store_dword v4, v6, off ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill @@ -10732,8 +10709,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: ;;#ASMSTART ; GFX9-FLATSCR-NEXT: ; def s33 ; GFX9-FLATSCR-NEXT: ;;#ASMEND -; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[34:35], vcc -; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-FLATSCR-NEXT: s_cmov_b64 exec, vcc +; GFX9-FLATSCR-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX9-FLATSCR-NEXT: ; %bb.1: ; %bb0 ; GFX9-FLATSCR-NEXT: ;;#ASMSTART ; GFX9-FLATSCR-NEXT: ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[38:39] @@ -10772,8 +10749,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: ;;#ASMEND ; GFX9-FLATSCR-NEXT: ;;#ASMSTART ; GFX9-FLATSCR-NEXT: ;;#ASMEND -; GFX9-FLATSCR-NEXT: .LBB1_2: ; %ret ; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-FLATSCR-NEXT: .LBB1_2: ; %ret ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20c0 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20b0 @@ -10859,7 +10836,9 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[39:42], v5, s[38:39] offset:16 ; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] ; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-FLATSCR-NEXT: v_lshl_add_u32 v4, v0, 13, 16 +; GFX10-FLATSCR-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX10-FLATSCR-NEXT: scratch_store_dword v4, v6, off ; GFX10-FLATSCR-NEXT: ;;#ASMSTART ; GFX10-FLATSCR-NEXT: ; def s[0:7] 
@@ -10882,8 +10861,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX10-FLATSCR-NEXT: ;;#ASMSTART ; GFX10-FLATSCR-NEXT: ; def s38 ; GFX10-FLATSCR-NEXT: ;;#ASMEND -; GFX10-FLATSCR-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX10-FLATSCR-NEXT: s_cbranch_execz .LBB1_2 +; GFX10-FLATSCR-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-FLATSCR-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX10-FLATSCR-NEXT: ; %bb.1: ; %bb0 ; GFX10-FLATSCR-NEXT: ;;#ASMSTART ; GFX10-FLATSCR-NEXT: ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[34:35] @@ -11017,8 +10996,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX10-FLATSCR-NEXT: ;;#ASMEND ; GFX10-FLATSCR-NEXT: ;;#ASMSTART ; GFX10-FLATSCR-NEXT: ;;#ASMEND -; GFX10-FLATSCR-NEXT: .LBB1_2: ; %ret ; GFX10-FLATSCR-NEXT: s_or_b32 exec_lo, exec_lo, s33 +; GFX10-FLATSCR-NEXT: .LBB1_2: ; %ret ; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[63:66], s[36:37] offset:112 ; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[59:62], s[36:37] offset:96 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index 8498e9af46f2b5..89fdad6b8f9a20 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -343,40 +343,43 @@ define i64 @v_test_srem(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_xor_b32_e32 v3, v3, v4 ; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 ; GCN-IR-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v2 -; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4 +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[4:5] +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v3 ; GCN-IR-NEXT: v_min_u32_e32 v12, v4, v5 ; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v0 -; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 
v5, v1 ; GCN-IR-NEXT: v_min_u32_e32 v13, v4, v5 -; GCN-IR-NEXT: v_sub_i32_e64 v4, s[6:7], v12, v13 -; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_subb_u32_e64 v5, s[6:7], 0, 0, s[6:7] -; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[4:5] -; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5] -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v13 +; GCN-IR-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[4:5], 63, v[4:5] +; GCN-IR-NEXT: s_or_b64 s[8:9], s[8:9], vcc +; GCN-IR-NEXT: s_xor_b64 s[10:11], s[8:9], -1 +; GCN-IR-NEXT: s_and_b64 s[4:5], s[10:11], s[4:5] +; GCN-IR-NEXT: s_mov_b64 s[6:7], exec ; GCN-IR-NEXT: v_mov_b32_e32 v15, v14 -; GCN-IR-NEXT: v_cndmask_b32_e64 v7, v1, 0, s[4:5] -; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v0, 0, s[4:5] -; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB1_6 +; GCN-IR-NEXT: v_cndmask_b32_e64 v7, v1, 0, s[8:9] +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v0, 0, s[8:9] +; GCN-IR-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_scc0 .LBB1_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 ; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v4 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v4 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4 ; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: s_xor_b64 s[8:9], vcc, exec ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB1_5 +; GCN-IR-NEXT: s_and_b64 s[4:5], vcc, -1 +; 
GCN-IR-NEXT: s_cmov_b64 exec, vcc +; GCN-IR-NEXT: s_cbranch_scc0 .LBB1_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, -1, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, -1, v3, vcc @@ -394,34 +397,34 @@ define i64 @v_test_srem(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5 ; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6 -; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 ; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v16, v10 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 ; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v17, v11, vcc +; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v8 ; GCN-IR-NEXT: v_or_b32_e32 v4, v12, v4 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v8 +; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; GCN-IR-NEXT: v_or_b32_e32 v5, v13, v5 ; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12 ; GCN-IR-NEXT: v_and_b32_e32 v13, v12, v3 ; GCN-IR-NEXT: v_and_b32_e32 v12, v12, v2 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] ; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v12 ; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v13, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v13, v7 ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v13, v7 +; GCN-IR-NEXT: s_andn2_b64 s[4:5], exec, s[10:11] ; GCN-IR-NEXT: v_mov_b32_e32 v12, v6 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3 +; GCN-IR-NEXT: s_cselect_b64 exec, s[4:5], s[10:11] +; GCN-IR-NEXT: s_cbranch_scc1 .LBB1_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB1_5: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB1_5: ; %Flow4 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 ; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v5 ; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4 -; GCN-IR-NEXT: .LBB1_6: ; %Flow5 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: 
.LBB1_6: ; %udiv-end ; GCN-IR-NEXT: v_mul_lo_u32 v4, v2, v7 ; GCN-IR-NEXT: v_mul_hi_u32 v5, v2, v6 ; GCN-IR-NEXT: v_mul_lo_u32 v3, v3, v6 @@ -1615,21 +1618,24 @@ define i64 @v_test_srem_k_num_i64(i64 %x) { ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: v_cndmask_b32_e64 v4, 24, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB11_6 +; GCN-IR-NEXT: s_mov_b64 s[8:9], exec +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_scc0 .LBB11_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], 24, v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 +; GCN-IR-NEXT: s_xor_b64 s[6:7], vcc, exec ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB11_5 +; GCN-IR-NEXT: s_and_b64 s[4:5], vcc, -1 +; GCN-IR-NEXT: s_cmov_b64 exec, vcc +; GCN-IR-NEXT: s_cbranch_scc0 .LBB11_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc @@ -1645,34 +1651,34 @@ define i64 @v_test_srem_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 ; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v8 +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v9, vcc +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 ; GCN-IR-NEXT: 
v_or_b32_e32 v2, v10, v2 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 +; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 ; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 ; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1 ; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0 -; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10 ; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 +; GCN-IR-NEXT: s_andn2_b64 s[4:5], exec, s[10:11] ; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB11_3 +; GCN-IR-NEXT: s_cselect_b64 exec, s[4:5], s[10:11] +; GCN-IR-NEXT: s_cbranch_scc1 .LBB11_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: .LBB11_5: ; %Flow4 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3 ; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2 -; GCN-IR-NEXT: .LBB11_6: ; %Flow5 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB11_6: ; %udiv-end ; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v5 ; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v4 ; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v4 @@ -1807,22 +1813,25 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB12_6 +; GCN-IR-NEXT: s_mov_b64 s[8:9], exec +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], 
exec +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_scc0 .LBB12_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2 -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 +; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[4:5], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 +; GCN-IR-NEXT: s_xor_b64 s[6:7], vcc, exec ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GCN-IR-NEXT: s_cbranch_execz .LBB12_5 +; GCN-IR-NEXT: s_and_b64 s[10:11], vcc, -1 +; GCN-IR-NEXT: s_cmov_b64 exec, vcc +; GCN-IR-NEXT: s_cbranch_scc0 .LBB12_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc @@ -1838,34 +1847,34 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 ; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v8 +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v9, vcc +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 ; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 +; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 ; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 ; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1 ; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0 -; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10 ; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], 
v9, v11, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 +; GCN-IR-NEXT: s_andn2_b64 s[4:5], exec, s[10:11] ; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB12_3 +; GCN-IR-NEXT: s_cselect_b64 exec, s[4:5], s[10:11] +; GCN-IR-NEXT: s_cbranch_scc1 .LBB12_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: .LBB12_5: ; %Flow4 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3 ; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2 -; GCN-IR-NEXT: .LBB12_6: ; %Flow5 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB12_6: ; %udiv-end ; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v5 ; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v4 ; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v4 @@ -1908,26 +1917,29 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, s[4:5] ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[2:3] -; GCN-IR-NEXT: v_mov_b32_e32 v13, v12 +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[2:3] ; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3] -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GCN-IR-NEXT: s_xor_b64 s[10:11], s[4:5], -1 +; GCN-IR-NEXT: s_and_b64 s[6:7], s[10:11], s[6:7] +; GCN-IR-NEXT: s_mov_b64 s[8:9], exec +; GCN-IR-NEXT: v_mov_b32_e32 v13, v12 ; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v1, 0, s[4:5] +; GCN-IR-NEXT: s_and_b64 s[6:7], s[6:7], exec ; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v0, 0, s[4:5] -; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB13_6 +; GCN-IR-NEXT: s_cmov_b64 exec, s[6:7] +; GCN-IR-NEXT: s_cbranch_scc0 
.LBB13_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[0:1], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 +; GCN-IR-NEXT: s_xor_b64 s[6:7], vcc, exec ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB13_5 +; GCN-IR-NEXT: s_and_b64 s[4:5], vcc, -1 +; GCN-IR-NEXT: s_cmov_b64 exec, vcc +; GCN-IR-NEXT: s_cbranch_scc0 .LBB13_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[0:1], v6 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 0xffffffcf, v10 @@ -1952,23 +1964,23 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 ; GCN-IR-NEXT: v_and_b32_e32 v10, 0x8000, v10 ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] -; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10 -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 +; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 ; GCN-IR-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v9, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 +; GCN-IR-NEXT: s_andn2_b64 s[4:5], exec, s[10:11] ; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB13_3 +; GCN-IR-NEXT: s_cselect_b64 exec, s[4:5], s[10:11] +; GCN-IR-NEXT: s_cbranch_scc1 .LBB13_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: .LBB13_5: ; %Flow4 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3 ; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2 -; GCN-IR-NEXT: .LBB13_6: ; %Flow5 -; 
GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB13_6: ; %udiv-end ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[4:5], 15 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc diff --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll index 132775d81ca1ad..ba100154668d6c 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll @@ -31,8 +31,9 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr ; MUBUF-NEXT: s_mov_b64 s[2:3], s[38:39] ; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5] ; MUBUF-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; MUBUF-NEXT: s_and_saveexec_b32 s0, vcc_lo -; MUBUF-NEXT: s_cbranch_execz .LBB0_2 +; MUBUF-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; MUBUF-NEXT: s_cmov_b32 exec_lo, vcc_lo +; MUBUF-NEXT: s_cbranch_scc0 .LBB0_2 ; MUBUF-NEXT: ; %bb.1: ; %if.then4.i ; MUBUF-NEXT: v_add_nc_u32_e64 v0, 4, 0x4000 ; MUBUF-NEXT: s_mov_b32 s0, 0x41c64e6d @@ -65,8 +66,9 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr ; FLATSCR-NEXT: v_mov_b32_e32 v0, s2 ; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] ; FLATSCR-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; FLATSCR-NEXT: s_and_saveexec_b32 s0, vcc_lo -; FLATSCR-NEXT: s_cbranch_execz .LBB0_2 +; FLATSCR-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; FLATSCR-NEXT: s_cmov_b32 exec_lo, vcc_lo +; FLATSCR-NEXT: s_cbranch_scc0 .LBB0_2 ; FLATSCR-NEXT: ; %bb.1: ; %if.then4.i ; FLATSCR-NEXT: s_movk_i32 s0, 0x4000 ; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:4 @@ -92,9 +94,10 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr ; MUBUF11-NEXT: s_waitcnt lgkmcnt(0) ; MUBUF11-NEXT: v_mov_b32_e32 v0, s2 ; MUBUF11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; MUBUF11-NEXT: s_mov_b32 s0, exec_lo 
-; MUBUF11-NEXT: v_cmpx_ne_u32_e32 0, v0 -; MUBUF11-NEXT: s_cbranch_execz .LBB0_2 +; MUBUF11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; MUBUF11-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; MUBUF11-NEXT: s_cmov_b32 exec_lo, vcc_lo +; MUBUF11-NEXT: s_cbranch_scc0 .LBB0_2 ; MUBUF11-NEXT: ; %bb.1: ; %if.then4.i ; MUBUF11-NEXT: s_movk_i32 s0, 0x4000 ; MUBUF11-NEXT: scratch_load_b64 v[0:1], off, s0 offset:4 @@ -119,9 +122,10 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr ; FLATSCR11-NEXT: s_waitcnt lgkmcnt(0) ; FLATSCR11-NEXT: v_mov_b32_e32 v0, s2 ; FLATSCR11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; FLATSCR11-NEXT: s_mov_b32 s0, exec_lo -; FLATSCR11-NEXT: v_cmpx_ne_u32_e32 0, v0 -; FLATSCR11-NEXT: s_cbranch_execz .LBB0_2 +; FLATSCR11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; FLATSCR11-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; FLATSCR11-NEXT: s_cmov_b32 exec_lo, vcc_lo +; FLATSCR11-NEXT: s_cbranch_scc0 .LBB0_2 ; FLATSCR11-NEXT: ; %bb.1: ; %if.then4.i ; FLATSCR11-NEXT: s_movk_i32 s0, 0x4000 ; FLATSCR11-NEXT: scratch_load_b64 v[0:1], off, s0 offset:4 diff --git a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll index 3c16cd29de8f6a..495885fa4fc53e 100644 --- a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll +++ b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll @@ -205,15 +205,17 @@ define void @func_stacksave_nonentry_block(i1 %cond) { ; WAVE32-OPT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; WAVE32-OPT-NEXT: v_and_b32_e32 v0, 1, v0 ; WAVE32-OPT-NEXT: s_mov_b32 s4, exec_lo -; WAVE32-OPT-NEXT: v_cmpx_eq_u32_e32 1, v0 -; WAVE32-OPT-NEXT: s_cbranch_execz .LBB4_2 +; WAVE32-OPT-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; WAVE32-OPT-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; WAVE32-OPT-NEXT: s_cmov_b32 exec_lo, vcc_lo +; WAVE32-OPT-NEXT: s_cbranch_scc0 .LBB4_2 ; WAVE32-OPT-NEXT: ; %bb.1: ; %bb1 ; WAVE32-OPT-NEXT: s_lshr_b32 s5, s32, 5 ; WAVE32-OPT-NEXT: ;;#ASMSTART ; WAVE32-OPT-NEXT: ; use s5 ; WAVE32-OPT-NEXT: ;;#ASMEND -; 
WAVE32-OPT-NEXT: .LBB4_2: ; %bb2 ; WAVE32-OPT-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; WAVE32-OPT-NEXT: .LBB4_2: ; %bb2 ; WAVE32-OPT-NEXT: s_setpc_b64 s[30:31] ; ; WAVE64-OPT-LABEL: func_stacksave_nonentry_block: @@ -221,15 +223,17 @@ define void @func_stacksave_nonentry_block(i1 %cond) { ; WAVE64-OPT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; WAVE64-OPT-NEXT: v_and_b32_e32 v0, 1, v0 ; WAVE64-OPT-NEXT: s_mov_b64 s[4:5], exec -; WAVE64-OPT-NEXT: v_cmpx_eq_u32_e32 1, v0 -; WAVE64-OPT-NEXT: s_cbranch_execz .LBB4_2 +; WAVE64-OPT-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; WAVE64-OPT-NEXT: s_cmp_lg_u64 vcc, 0 +; WAVE64-OPT-NEXT: s_cmov_b64 exec, vcc +; WAVE64-OPT-NEXT: s_cbranch_scc0 .LBB4_2 ; WAVE64-OPT-NEXT: ; %bb.1: ; %bb1 ; WAVE64-OPT-NEXT: s_lshr_b32 s6, s32, 6 ; WAVE64-OPT-NEXT: ;;#ASMSTART ; WAVE64-OPT-NEXT: ; use s6 ; WAVE64-OPT-NEXT: ;;#ASMEND -; WAVE64-OPT-NEXT: .LBB4_2: ; %bb2 ; WAVE64-OPT-NEXT: s_or_b64 exec, exec, s[4:5] +; WAVE64-OPT-NEXT: .LBB4_2: ; %bb2 ; WAVE64-OPT-NEXT: s_setpc_b64 s[30:31] ; ; WAVE32-O0-LABEL: func_stacksave_nonentry_block: @@ -244,29 +248,33 @@ define void @func_stacksave_nonentry_block(i1 %cond) { ; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s7 ; WAVE32-O0-NEXT: v_and_b32_e64 v1, 1, v1 -; WAVE32-O0-NEXT: v_cmp_eq_u32_e64 s5, v1, 1 -; WAVE32-O0-NEXT: s_mov_b32 s4, exec_lo +; WAVE32-O0-NEXT: v_cmp_eq_u32_e64 s4, v1, 1 +; WAVE32-O0-NEXT: s_mov_b32 s5, exec_lo ; WAVE32-O0-NEXT: s_waitcnt vmcnt(0) -; WAVE32-O0-NEXT: v_writelane_b32 v0, s4, 0 +; WAVE32-O0-NEXT: v_writelane_b32 v0, s5, 0 ; WAVE32-O0-NEXT: s_or_saveexec_b32 s7, -1 ; WAVE32-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s7 -; WAVE32-O0-NEXT: s_and_b32 s4, s4, s5 -; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s4 -; WAVE32-O0-NEXT: s_cbranch_execz .LBB4_2 -; WAVE32-O0-NEXT: ; %bb.1: ; %bb1 -; WAVE32-O0-NEXT: s_mov_b32 s4, s32 -; WAVE32-O0-NEXT: s_lshr_b32 s4, 
s4, 5 -; WAVE32-O0-NEXT: ;;#ASMSTART -; WAVE32-O0-NEXT: ; use s4 -; WAVE32-O0-NEXT: ;;#ASMEND -; WAVE32-O0-NEXT: .LBB4_2: ; %bb2 +; WAVE32-O0-NEXT: s_cmp_lg_u32 s4, 0 +; WAVE32-O0-NEXT: s_cmov_b32 exec_lo, s4 +; WAVE32-O0-NEXT: s_cbranch_scc1 .LBB4_1 +; WAVE32-O0-NEXT: s_branch .LBB4_2 +; WAVE32-O0-NEXT: .LBB4_1: ; %bb1 ; WAVE32-O0-NEXT: s_or_saveexec_b32 s7, -1 ; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s7 ; WAVE32-O0-NEXT: s_waitcnt vmcnt(0) ; WAVE32-O0-NEXT: v_readlane_b32 s4, v0, 0 +; WAVE32-O0-NEXT: s_mov_b32 s5, s32 +; WAVE32-O0-NEXT: s_lshr_b32 s5, s5, 5 +; WAVE32-O0-NEXT: ;;#ASMSTART +; WAVE32-O0-NEXT: ; use s5 +; WAVE32-O0-NEXT: ;;#ASMEND ; WAVE32-O0-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; WAVE32-O0-NEXT: .LBB4_2: ; %bb2 +; WAVE32-O0-NEXT: s_or_saveexec_b32 s7, -1 +; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s7 ; WAVE32-O0-NEXT: ; kill: killed $vgpr0 ; WAVE32-O0-NEXT: s_xor_saveexec_b32 s4, -1 ; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -286,31 +294,35 @@ define void @func_stacksave_nonentry_block(i1 %cond) { ; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; WAVE64-O0-NEXT: s_mov_b64 exec, s[10:11] ; WAVE64-O0-NEXT: v_and_b32_e64 v1, 1, v1 -; WAVE64-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v1, 1 -; WAVE64-O0-NEXT: s_mov_b64 s[4:5], exec +; WAVE64-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, 1 +; WAVE64-O0-NEXT: s_mov_b64 s[6:7], exec ; WAVE64-O0-NEXT: s_waitcnt vmcnt(0) -; WAVE64-O0-NEXT: v_writelane_b32 v0, s4, 0 -; WAVE64-O0-NEXT: v_writelane_b32 v0, s5, 1 +; WAVE64-O0-NEXT: v_writelane_b32 v0, s6, 0 +; WAVE64-O0-NEXT: v_writelane_b32 v0, s7, 1 ; WAVE64-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 ; WAVE64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; WAVE64-O0-NEXT: s_mov_b64 exec, s[10:11] -; WAVE64-O0-NEXT: s_and_b64 
s[4:5], s[4:5], s[6:7] -; WAVE64-O0-NEXT: s_mov_b64 exec, s[4:5] -; WAVE64-O0-NEXT: s_cbranch_execz .LBB4_2 -; WAVE64-O0-NEXT: ; %bb.1: ; %bb1 -; WAVE64-O0-NEXT: s_mov_b32 s4, s32 -; WAVE64-O0-NEXT: s_lshr_b32 s4, s4, 6 -; WAVE64-O0-NEXT: ;;#ASMSTART -; WAVE64-O0-NEXT: ; use s4 -; WAVE64-O0-NEXT: ;;#ASMEND -; WAVE64-O0-NEXT: .LBB4_2: ; %bb2 +; WAVE64-O0-NEXT: s_cmp_lg_u64 s[4:5], 0 +; WAVE64-O0-NEXT: s_cmov_b64 exec, s[4:5] +; WAVE64-O0-NEXT: s_cbranch_scc1 .LBB4_1 +; WAVE64-O0-NEXT: s_branch .LBB4_2 +; WAVE64-O0-NEXT: .LBB4_1: ; %bb1 ; WAVE64-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 ; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; WAVE64-O0-NEXT: s_mov_b64 exec, s[10:11] ; WAVE64-O0-NEXT: s_waitcnt vmcnt(0) ; WAVE64-O0-NEXT: v_readlane_b32 s4, v0, 0 ; WAVE64-O0-NEXT: v_readlane_b32 s5, v0, 1 +; WAVE64-O0-NEXT: s_mov_b32 s6, s32 +; WAVE64-O0-NEXT: s_lshr_b32 s6, s6, 6 +; WAVE64-O0-NEXT: ;;#ASMSTART +; WAVE64-O0-NEXT: ; use s6 +; WAVE64-O0-NEXT: ;;#ASMEND ; WAVE64-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; WAVE64-O0-NEXT: .LBB4_2: ; %bb2 +; WAVE64-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 +; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; WAVE64-O0-NEXT: s_mov_b64 exec, s[10:11] ; WAVE64-O0-NEXT: ; kill: killed $vgpr0 ; WAVE64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -326,21 +338,22 @@ define void @func_stacksave_nonentry_block(i1 %cond) { ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 exec_lo, s4 ; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane ; WAVE32-WWM-PREALLOC-NEXT: v_and_b32_e64 v0, 1, v0 -; WAVE32-WWM-PREALLOC-NEXT: v_cmp_eq_u32_e64 s5, v0, 1 -; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s4, exec_lo -; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v1, s4, 0 -; WAVE32-WWM-PREALLOC-NEXT: s_and_b32 s4, s4, s5 -; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 exec_lo, s4 -; WAVE32-WWM-PREALLOC-NEXT: s_cbranch_execz 
.LBB4_2 -; WAVE32-WWM-PREALLOC-NEXT: ; %bb.1: ; %bb1 -; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s4, s32 -; WAVE32-WWM-PREALLOC-NEXT: s_lshr_b32 s4, s4, 5 +; WAVE32-WWM-PREALLOC-NEXT: v_cmp_eq_u32_e64 s4, v0, 1 +; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s5, exec_lo +; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v1, s5, 0 +; WAVE32-WWM-PREALLOC-NEXT: s_cmp_lg_u32 s4, 0 +; WAVE32-WWM-PREALLOC-NEXT: s_cmov_b32 exec_lo, s4 +; WAVE32-WWM-PREALLOC-NEXT: s_cbranch_scc1 .LBB4_1 +; WAVE32-WWM-PREALLOC-NEXT: s_branch .LBB4_2 +; WAVE32-WWM-PREALLOC-NEXT: .LBB4_1: ; %bb1 +; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s4, v1, 0 +; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s5, s32 +; WAVE32-WWM-PREALLOC-NEXT: s_lshr_b32 s5, s5, 5 ; WAVE32-WWM-PREALLOC-NEXT: ;;#ASMSTART -; WAVE32-WWM-PREALLOC-NEXT: ; use s4 +; WAVE32-WWM-PREALLOC-NEXT: ; use s5 ; WAVE32-WWM-PREALLOC-NEXT: ;;#ASMEND -; WAVE32-WWM-PREALLOC-NEXT: .LBB4_2: ; %bb2 -; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s4, v1, 0 ; WAVE32-WWM-PREALLOC-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; WAVE32-WWM-PREALLOC-NEXT: .LBB4_2: ; %bb2 ; WAVE32-WWM-PREALLOC-NEXT: ; kill: killed $vgpr1 ; WAVE32-WWM-PREALLOC-NEXT: s_xor_saveexec_b32 s4, -1 ; WAVE32-WWM-PREALLOC-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/stale-livevar-in-twoaddr-pass.mir b/llvm/test/CodeGen/AMDGPU/stale-livevar-in-twoaddr-pass.mir index 4bb0046c0ee01a..16c439a51033d6 100644 --- a/llvm/test/CodeGen/AMDGPU/stale-livevar-in-twoaddr-pass.mir +++ b/llvm/test/CodeGen/AMDGPU/stale-livevar-in-twoaddr-pass.mir @@ -18,10 +18,10 @@ body: | bb.1: %2:vgpr_32 = V_MAC_F32_e32 0, %0, %1, implicit $mode, implicit $exec %3:vgpr_32 = V_MED3_F32_e64 0, %1, 0, %2, 0, %2, 0, 0, implicit $mode, implicit $exec + SI_WAVE_RECONVERGE %6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2: %4:vgpr_32 = PHI %5, %bb.3, %3, %bb.1 - SI_END_CF %6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec EXP_DONE 0, %4, %4, %4, %4, -1, 0, 
15, implicit $exec S_ENDPGM 0 diff --git a/llvm/test/CodeGen/AMDGPU/stop-tail-duplicate-cfg-intrinsic.mir b/llvm/test/CodeGen/AMDGPU/stop-tail-duplicate-cfg-intrinsic.mir index c23c8900096fba..19e012413630ac 100644 --- a/llvm/test/CodeGen/AMDGPU/stop-tail-duplicate-cfg-intrinsic.mir +++ b/llvm/test/CodeGen/AMDGPU/stop-tail-duplicate-cfg-intrinsic.mir @@ -26,7 +26,6 @@ body: | ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[PHI]], [[COPY2]], 0, implicit $exec @@ -40,6 +39,7 @@ body: | ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[DEF]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET [[PHI1]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.1 bb.1: liveins: $vgpr0 @@ -56,7 +56,6 @@ body: | S_BRANCH %bb.4 bb.3: - SI_END_CF %8:sreg_64_xexec, implicit-def $exec, implicit-def $scc, implicit $exec %13:sreg_32 = S_MOV_B32 1 %15:vgpr_32 = COPY %13:sreg_32 %10:vgpr_32, dead %20:sreg_64_xexec = V_ADD_CO_U32_e64 %6:vgpr_32, %15:vgpr_32, 0, implicit $exec @@ -68,6 +67,7 @@ body: | %18:sreg_64 = REG_SEQUENCE %16:sreg_32, %subreg.sub0, %17:sreg_32, %subreg.sub1 %19:sgpr_128 = REG_SEQUENCE %12:sreg_64, %subreg.sub0_sub1, %18:sreg_64, %subreg.sub2_sub3 BUFFER_STORE_DWORD_OFFSET %11:vgpr_32, %19:sgpr_128, 0, 0, 0, 0, implicit $exec + SI_WAVE_RECONVERGE %8:sreg_64_xexec, implicit-def $exec, implicit-def 
$scc, implicit $exec S_BRANCH %bb.2 ... diff --git a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll index d4329aec2021c0..7a97ca230c93ca 100644 --- a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll +++ b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll @@ -12,22 +12,25 @@ define amdgpu_kernel void @foobar(float %a0, float %a1, ptr addrspace(1) %out) # ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; CHECK-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 +; CHECK-NEXT: s_mov_b64 s[6:7], exec ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: s_and_b64 s[8:9], vcc, -1 ; CHECK-NEXT: v_mov_b32_e32 v1, s5 ; CHECK-NEXT: v_mov_b32_e32 v2, s6 ; CHECK-NEXT: v_mov_b32_e32 v3, s7 -; CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc +; CHECK-NEXT: s_cmov_b64 exec, vcc +; CHECK-NEXT: s_cbranch_scc0 .LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %ift ; CHECK-NEXT: s_mov_b32 s4, s5 ; CHECK-NEXT: v_mov_b32_e32 v0, s4 ; CHECK-NEXT: v_mov_b32_e32 v1, s5 ; CHECK-NEXT: v_mov_b32_e32 v2, s6 ; CHECK-NEXT: v_mov_b32_e32 v3, s7 -; CHECK-NEXT: ; %bb.2: ; %ife ; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: .LBB0_2: ; %ife ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; CHECK-NEXT: s_mov_b32 s3, 0xf000 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll index ecebbb9ac874f8..58af396715bae5 100644 --- a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll +++ b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll @@ -100,67 +100,66 @@ else: ; preds = %else.if.cond define amdgpu_ps { <4 x float> } @test_return_to_epilog_with_optimized_kill(float %val) #0 { ; GCN-LABEL: name: test_return_to_epilog_with_optimized_kill ; GCN: bb.0 (%ir-block.0): - ; 
GCN-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000) + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000) ; GCN-NEXT: liveins: $vgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_RCP_F32_e32 $vgpr0, implicit $mode, implicit $exec ; GCN-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GCN-NEXT: nofpexcept V_CMP_NGT_F32_e32 0, killed $vgpr1, implicit-def $vcc, implicit $mode, implicit $exec - ; GCN-NEXT: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec - ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_XOR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def dead $scc - ; GCN-NEXT: S_CBRANCH_EXECNZ %bb.3, implicit $exec + ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_XOR_B64 renamable $vcc, $exec, implicit-def $scc + ; GCN-NEXT: S_CMP_LG_U64 renamable $vcc, 0, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64 killed renamable $vcc, implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC0 %bb.4, implicit killed $scc ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.1.Flow1: - ; GCN-NEXT: successors: %bb.6(0x40000000), %bb.2(0x40000000) - ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr2_sgpr3 = S_ANDN2_SAVEEXEC_B64 killed $sgpr2_sgpr3, implicit-def $exec, implicit-def $scc, implicit $exec - ; GCN-NEXT: S_CBRANCH_EXECNZ %bb.6, implicit $exec - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.2.end: - ; GCN-NEXT: successors: %bb.9(0x80000000) - ; GCN-NEXT: liveins: $sgpr2_sgpr3 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc - ; GCN-NEXT: S_BRANCH %bb.9 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.3.flow.preheader: - ; GCN-NEXT: successors: %bb.4(0x80000000) + ; GCN-NEXT: bb.1.flow.preheader: + ; GCN-NEXT: successors: %bb.2(0x80000000) ; GCN-NEXT: liveins: $vgpr0, $sgpr0_sgpr1, $sgpr2_sgpr3 ; GCN-NEXT: {{ $}} ; GCN-NEXT: nofpexcept V_CMP_NGT_F32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $mode, implicit $exec ; GCN-NEXT: renamable $sgpr4_sgpr5 = 
S_MOV_B64 0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.4.flow: - ; GCN-NEXT: successors: %bb.5(0x04000000), %bb.4(0x7c000000) + ; GCN-NEXT: bb.2.flow: + ; GCN-NEXT: successors: %bb.3(0x04000000), %bb.2(0x7c000000) ; GCN-NEXT: liveins: $vcc, $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $sgpr6_sgpr7 = S_AND_B64 $exec, renamable $vcc, implicit-def $scc ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_OR_B64 killed renamable $sgpr6_sgpr7, killed renamable $sgpr4_sgpr5, implicit-def $scc - ; GCN-NEXT: $exec = S_ANDN2_B64 $exec, renamable $sgpr4_sgpr5, implicit-def $scc - ; GCN-NEXT: S_CBRANCH_EXECNZ %bb.4, implicit $exec + ; GCN-NEXT: renamable $sgpr6_sgpr7 = S_ANDN2_B64 $exec, renamable $sgpr4_sgpr5, implicit-def $scc + ; GCN-NEXT: $exec = S_CSELECT_B64 killed renamable $sgpr6_sgpr7, renamable $sgpr4_sgpr5, implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit killed $scc ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.5.Flow: - ; GCN-NEXT: successors: %bb.6(0x40000000), %bb.2(0x40000000) - ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5 + ; GCN-NEXT: bb.3.Flow: + ; GCN-NEXT: successors: %bb.4(0x80000000) + ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc - ; GCN-NEXT: $sgpr2_sgpr3 = S_ANDN2_SAVEEXEC_B64 killed $sgpr2_sgpr3, implicit-def $exec, implicit-def $scc, implicit $exec - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GCN-NEXT: $exec = S_OR_B64 $exec, renamable $sgpr2_sgpr3, implicit-def $scc ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.6.kill0: - ; GCN-NEXT: successors: %bb.7(0x40000000), %bb.8(0x40000000) + ; GCN-NEXT: bb.4.Flow1: + ; GCN-NEXT: successors: %bb.5(0x40000000), %bb.7(0x40000000) ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3 ; GCN-NEXT: {{ $}} + ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_XOR_B64 renamable $sgpr2_sgpr3, $exec, implicit-def $scc + ; GCN-NEXT: S_CMP_LG_U64 renamable $sgpr2_sgpr3, 0, implicit-def $scc + ; 
GCN-NEXT: $exec = S_CMOV_B64 killed renamable $sgpr2_sgpr3, implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC0 %bb.7, implicit killed $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.5.kill0: + ; GCN-NEXT: successors: %bb.6(0x40000000), %bb.8(0x40000000) + ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr4_sgpr5 + ; GCN-NEXT: {{ $}} ; GCN-NEXT: dead renamable $sgpr0_sgpr1 = S_ANDN2_B64 killed renamable $sgpr0_sgpr1, $exec, implicit-def $scc ; GCN-NEXT: S_CBRANCH_SCC0 %bb.8, implicit $scc ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.7.kill0: - ; GCN-NEXT: successors: %bb.9(0x80000000) - ; GCN-NEXT: liveins: $sgpr2_sgpr3, $scc + ; GCN-NEXT: bb.6.kill0: + ; GCN-NEXT: successors: %bb.7(0x80000000) + ; GCN-NEXT: liveins: $sgpr4_sgpr5, $scc ; GCN-NEXT: {{ $}} ; GCN-NEXT: $exec = S_MOV_B64 0 - ; GCN-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc + ; GCN-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.7.end: + ; GCN-NEXT: successors: %bb.9(0x80000000) + ; GCN-NEXT: {{ $}} ; GCN-NEXT: S_BRANCH %bb.9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.8: diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll index 52370f6a2ef054..f768f2a672614a 100644 --- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll +++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll @@ -371,11 +371,11 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs ; HSA-TRAP-GFX1100-O0-NEXT: s_or_saveexec_b32 s6, -1 ; HSA-TRAP-GFX1100-O0-NEXT: scratch_load_b32 v0, off, off offset:4 ; 4-byte Folded Reload ; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 exec_lo, s6 -; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt vmcnt(0) -; HSA-TRAP-GFX1100-O0-NEXT: v_readlane_b32 s0, v0, 0 -; HSA-TRAP-GFX1100-O0-NEXT: v_readlane_b32 s1, v0, 1 ; HSA-TRAP-GFX1100-O0-NEXT: scratch_load_b32 v1, off, off offset:8 ; 4-byte Folded Reload ; HSA-TRAP-GFX1100-O0-NEXT: scratch_load_b32 v2, off, off ; 4-byte Folded Reload +; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt 
vmcnt(2) +; HSA-TRAP-GFX1100-O0-NEXT: v_readlane_b32 s0, v0, 0 +; HSA-TRAP-GFX1100-O0-NEXT: v_readlane_b32 s1, v0, 1 ; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt vmcnt(0) ; HSA-TRAP-GFX1100-O0-NEXT: global_store_b32 v1, v2, s[0:1] dlc ; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt_vscnt null, 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll index 837b46f0ce578d..ec0a3497f78bf8 100644 --- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ -93,22 +93,24 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[54:55], 1, v1 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[56:57], 1, v3 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[58:59], 1, v2 -; GLOBALNESS1-NEXT: s_branch .LBB1_4 +; GLOBALNESS1-NEXT: s_branch .LBB1_5 ; GLOBALNESS1-NEXT: .LBB1_1: ; %bb70.i -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[58:59] ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_29 -; GLOBALNESS1-NEXT: .LBB1_2: ; %Flow15 -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: .LBB1_2: ; %Flow14 +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[4:5] +; GLOBALNESS1-NEXT: .LBB1_3: ; %Flow15 +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], 0 ; GLOBALNESS1-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GLOBALNESS1-NEXT: .LBB1_3: ; %Flow28 -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: .LBB1_4: ; %Flow28 +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7] ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[44:45], v[0:1], v[0:1] op_sel:[0,1] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_30 -; GLOBALNESS1-NEXT: .LBB1_4: ; %bb5 +; GLOBALNESS1-NEXT: .LBB1_5: ; %bb5 ; GLOBALNESS1-NEXT: 
; =>This Loop Header: Depth=1 ; GLOBALNESS1-NEXT: ; Child Loop BB1_16 Depth 2 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0x80 @@ -133,52 +135,54 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[44:45] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], -1 ; GLOBALNESS1-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_9 -; GLOBALNESS1-NEXT: ; %bb.5: ; %NodeBlock -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_10 +; GLOBALNESS1-NEXT: ; %bb.6: ; %NodeBlock +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS1-NEXT: s_cmp_lt_i32 s75, 1 -; GLOBALNESS1-NEXT: s_cbranch_scc1 .LBB1_7 -; GLOBALNESS1-NEXT: ; %bb.6: ; %LeafBlock12 -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: s_cbranch_scc1 .LBB1_8 +; GLOBALNESS1-NEXT: ; %bb.7: ; %LeafBlock12 +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS1-NEXT: s_cmp_lg_u32 s75, 1 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], -1 ; GLOBALNESS1-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_8 -; GLOBALNESS1-NEXT: s_branch .LBB1_9 -; GLOBALNESS1-NEXT: .LBB1_7: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_9 +; GLOBALNESS1-NEXT: s_branch .LBB1_10 +; GLOBALNESS1-NEXT: .LBB1_8: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], 0 ; GLOBALNESS1-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GLOBALNESS1-NEXT: .LBB1_8: ; %LeafBlock -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: .LBB1_9: ; %LeafBlock +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS1-NEXT: s_cmp_lg_u32 s75, 0 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], 0 ; GLOBALNESS1-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GLOBALNESS1-NEXT: .LBB1_9: ; %Flow25 -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: .LBB1_10: ; %Flow25 +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; 
GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7] ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_24 -; GLOBALNESS1-NEXT: ; %bb.10: ; %baz.exit.i -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: ; %bb.11: ; %baz.exit.i +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0 ; GLOBALNESS1-NEXT: flat_load_dword v0, v[2:3] +; GLOBALNESS1-NEXT: s_mov_b64 s[72:73], exec ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[60:61], 0, v0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0x3ff00000 -; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[72:73], s[60:61] -; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_26 -; GLOBALNESS1-NEXT: ; %bb.11: ; %bb33.i -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: s_cmp_lg_u64 s[60:61], 0 +; GLOBALNESS1-NEXT: s_cmov_b64 exec, s[60:61] +; GLOBALNESS1-NEXT: s_cbranch_scc0 .LBB1_26 +; GLOBALNESS1-NEXT: ; %bb.12: ; %bb33.i +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[2:3], off ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[52:53] -; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_13 -; GLOBALNESS1-NEXT: ; %bb.12: ; %bb39.i -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_14 +; GLOBALNESS1-NEXT: ; %bb.13: ; %bb39.i +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0 ; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[42:43], off -; GLOBALNESS1-NEXT: .LBB1_13: ; %bb44.lr.ph.i -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: .LBB1_14: ; %bb44.lr.ph.i +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46 ; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) @@ -187,15 +191,12 @@ define amdgpu_kernel void @kernel(ptr 
addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[62:63], 0, v2 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[64:65], 1, v0 ; GLOBALNESS1-NEXT: s_branch .LBB1_16 -; GLOBALNESS1-NEXT: .LBB1_14: ; %Flow16 -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[4:5] ; GLOBALNESS1-NEXT: .LBB1_15: ; %bb63.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[50:51] ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_25 ; GLOBALNESS1-NEXT: .LBB1_16: ; %bb44.i -; GLOBALNESS1-NEXT: ; Parent Loop BB1_4 Depth=1 +; GLOBALNESS1-NEXT: ; Parent Loop BB1_5 Depth=1 ; GLOBALNESS1-NEXT: ; => This Inner Loop Header: Depth=2 ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[46:47] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_15 @@ -245,37 +246,43 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], v[44:45], off ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[76:77] -; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[62:63] -; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_14 +; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], exec +; GLOBALNESS1-NEXT: s_and_b64 s[6:7], s[62:63], exec +; GLOBALNESS1-NEXT: s_cmov_b64 exec, s[6:7] +; GLOBALNESS1-NEXT: s_cbranch_scc0 .LBB1_15 ; GLOBALNESS1-NEXT: ; %bb.23: ; %bb62.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42 ; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], v[42:43], off -; GLOBALNESS1-NEXT: s_branch .LBB1_14 -; GLOBALNESS1-NEXT: .LBB1_24: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[4:5] +; GLOBALNESS1-NEXT: s_branch .LBB1_15 +; GLOBALNESS1-NEXT: .LBB1_24: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], -1 ; GLOBALNESS1-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GLOBALNESS1-NEXT: s_branch .LBB1_3 +; GLOBALNESS1-NEXT: s_branch .LBB1_4 ; 
GLOBALNESS1-NEXT: .LBB1_25: ; %Flow23 -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 -; GLOBALNESS1-NEXT: .LBB1_26: ; %Flow24 -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[72:73] -; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[60:61] -; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_2 +; GLOBALNESS1-NEXT: .LBB1_26: ; %bb64.i +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_5 Depth=1 +; GLOBALNESS1-NEXT: s_and_b64 s[6:7], s[60:61], exec +; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; GLOBALNESS1-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GLOBALNESS1-NEXT: s_cmov_b64 exec, s[6:7] +; GLOBALNESS1-NEXT: s_cbranch_scc0 .LBB1_3 ; GLOBALNESS1-NEXT: ; %bb.27: ; %bb67.i -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[56:57] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_1 ; GLOBALNESS1-NEXT: ; %bb.28: ; %bb69.i -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0 ; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[42:43], off ; GLOBALNESS1-NEXT: s_branch .LBB1_1 ; GLOBALNESS1-NEXT: .LBB1_29: ; %bb73.i -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0 ; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[42:43], off @@ -380,22 +387,24 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[54:55], 1, v1 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[56:57], 1, v3 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[58:59], 1, v2 -; GLOBALNESS0-NEXT: s_branch .LBB1_4 +; GLOBALNESS0-NEXT: s_branch .LBB1_5 ; GLOBALNESS0-NEXT: 
.LBB1_1: ; %bb70.i -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[58:59] ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_29 -; GLOBALNESS0-NEXT: .LBB1_2: ; %Flow15 -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: .LBB1_2: ; %Flow14 +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[4:5] +; GLOBALNESS0-NEXT: .LBB1_3: ; %Flow15 +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], 0 ; GLOBALNESS0-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GLOBALNESS0-NEXT: .LBB1_3: ; %Flow28 -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: .LBB1_4: ; %Flow28 +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7] ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[44:45], v[0:1], v[0:1] op_sel:[0,1] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_30 -; GLOBALNESS0-NEXT: .LBB1_4: ; %bb5 +; GLOBALNESS0-NEXT: .LBB1_5: ; %bb5 ; GLOBALNESS0-NEXT: ; =>This Loop Header: Depth=1 ; GLOBALNESS0-NEXT: ; Child Loop BB1_16 Depth 2 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0x80 @@ -420,52 +429,54 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[44:45] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], -1 ; GLOBALNESS0-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_9 -; GLOBALNESS0-NEXT: ; %bb.5: ; %NodeBlock -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_10 +; GLOBALNESS0-NEXT: ; %bb.6: ; %NodeBlock +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS0-NEXT: s_cmp_lt_i32 s75, 1 -; GLOBALNESS0-NEXT: s_cbranch_scc1 .LBB1_7 -; GLOBALNESS0-NEXT: ; %bb.6: ; %LeafBlock12 -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: s_cbranch_scc1 .LBB1_8 +; GLOBALNESS0-NEXT: ; %bb.7: ; %LeafBlock12 +; 
GLOBALNESS0-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS0-NEXT: s_cmp_lg_u32 s75, 1 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], -1 ; GLOBALNESS0-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_8 -; GLOBALNESS0-NEXT: s_branch .LBB1_9 -; GLOBALNESS0-NEXT: .LBB1_7: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_9 +; GLOBALNESS0-NEXT: s_branch .LBB1_10 +; GLOBALNESS0-NEXT: .LBB1_8: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], 0 ; GLOBALNESS0-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GLOBALNESS0-NEXT: .LBB1_8: ; %LeafBlock -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: .LBB1_9: ; %LeafBlock +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS0-NEXT: s_cmp_lg_u32 s75, 0 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], 0 ; GLOBALNESS0-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GLOBALNESS0-NEXT: .LBB1_9: ; %Flow25 -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: .LBB1_10: ; %Flow25 +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7] ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_24 -; GLOBALNESS0-NEXT: ; %bb.10: ; %baz.exit.i -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: ; %bb.11: ; %baz.exit.i +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0 ; GLOBALNESS0-NEXT: flat_load_dword v0, v[2:3] +; GLOBALNESS0-NEXT: s_mov_b64 s[72:73], exec ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[60:61], 0, v0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0x3ff00000 -; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[72:73], s[60:61] -; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_26 -; GLOBALNESS0-NEXT: ; %bb.11: ; %bb33.i -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: s_cmp_lg_u64 s[60:61], 0 +; GLOBALNESS0-NEXT: s_cmov_b64 exec, s[60:61] +; GLOBALNESS0-NEXT: 
s_cbranch_scc0 .LBB1_26 +; GLOBALNESS0-NEXT: ; %bb.12: ; %bb33.i +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[2:3], off ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[52:53] -; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_13 -; GLOBALNESS0-NEXT: ; %bb.12: ; %bb39.i -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_14 +; GLOBALNESS0-NEXT: ; %bb.13: ; %bb39.i +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0 ; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[42:43], off -; GLOBALNESS0-NEXT: .LBB1_13: ; %bb44.lr.ph.i -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: .LBB1_14: ; %bb44.lr.ph.i +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46 ; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) @@ -474,15 +485,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[62:63], 0, v2 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[64:65], 1, v0 ; GLOBALNESS0-NEXT: s_branch .LBB1_16 -; GLOBALNESS0-NEXT: .LBB1_14: ; %Flow16 -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[4:5] ; GLOBALNESS0-NEXT: .LBB1_15: ; %bb63.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[50:51] ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_25 ; GLOBALNESS0-NEXT: .LBB1_16: ; %bb44.i -; GLOBALNESS0-NEXT: ; Parent Loop BB1_4 Depth=1 +; GLOBALNESS0-NEXT: ; Parent Loop BB1_5 Depth=1 ; GLOBALNESS0-NEXT: ; => This Inner Loop Header: Depth=2 ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[46:47] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_15 @@ -532,37 +540,43 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: 
v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], v[44:45], off ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[76:77] -; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[62:63] -; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_14 +; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], exec +; GLOBALNESS0-NEXT: s_and_b64 s[6:7], s[62:63], exec +; GLOBALNESS0-NEXT: s_cmov_b64 exec, s[6:7] +; GLOBALNESS0-NEXT: s_cbranch_scc0 .LBB1_15 ; GLOBALNESS0-NEXT: ; %bb.23: ; %bb62.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42 ; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], v[42:43], off -; GLOBALNESS0-NEXT: s_branch .LBB1_14 -; GLOBALNESS0-NEXT: .LBB1_24: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[4:5] +; GLOBALNESS0-NEXT: s_branch .LBB1_15 +; GLOBALNESS0-NEXT: .LBB1_24: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], -1 ; GLOBALNESS0-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GLOBALNESS0-NEXT: s_branch .LBB1_3 +; GLOBALNESS0-NEXT: s_branch .LBB1_4 ; GLOBALNESS0-NEXT: .LBB1_25: ; %Flow23 -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 -; GLOBALNESS0-NEXT: .LBB1_26: ; %Flow24 -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[72:73] -; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[60:61] -; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_2 +; GLOBALNESS0-NEXT: .LBB1_26: ; %bb64.i +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_5 Depth=1 +; GLOBALNESS0-NEXT: s_and_b64 s[6:7], s[60:61], exec +; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; GLOBALNESS0-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GLOBALNESS0-NEXT: s_cmov_b64 exec, s[6:7] +; GLOBALNESS0-NEXT: s_cbranch_scc0 .LBB1_3 ; GLOBALNESS0-NEXT: ; %bb.27: ; %bb67.i -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS0-NEXT: s_and_b64 
vcc, exec, s[56:57] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_1 ; GLOBALNESS0-NEXT: ; %bb.28: ; %bb69.i -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0 ; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[42:43], off ; GLOBALNESS0-NEXT: s_branch .LBB1_1 ; GLOBALNESS0-NEXT: .LBB1_29: ; %bb73.i -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0 ; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[42:43], off diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index a90454f50d198c..8e7c33395ab0f3 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -310,39 +310,42 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) { ; GCN-IR-LABEL: v_test_udiv_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v2 -; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4 +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[4:5] +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v3 ; GCN-IR-NEXT: v_min_u32_e32 v10, v4, v5 ; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v0 -; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1 ; GCN-IR-NEXT: v_min_u32_e32 v11, v4, v5 -; GCN-IR-NEXT: v_sub_i32_e64 v6, s[6:7], v10, v11 -; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[6:7], 0, 0, s[6:7] -; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[6:7] -; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-IR-NEXT: s_or_b64 
s[4:5], s[4:5], s[6:7] -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[6:7] -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v1, 0, s[4:5] -; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v0, 0, s[4:5] -; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB1_6 +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v10, v11 +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[6:7] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[4:5], 63, v[6:7] +; GCN-IR-NEXT: s_or_b64 s[8:9], s[8:9], vcc +; GCN-IR-NEXT: s_xor_b64 s[10:11], s[8:9], -1 +; GCN-IR-NEXT: s_and_b64 s[4:5], s[10:11], s[4:5] +; GCN-IR-NEXT: s_mov_b64 s[6:7], exec +; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v1, 0, s[8:9] +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v0, 0, s[8:9] +; GCN-IR-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_scc0 .LBB1_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v6 ; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v7, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v6 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v6 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4 ; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: s_xor_b64 s[8:9], vcc, exec ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB1_5 +; GCN-IR-NEXT: s_and_b64 s[4:5], vcc, -1 +; GCN-IR-NEXT: s_cmov_b64 exec, vcc +; GCN-IR-NEXT: s_cbranch_scc0 .LBB1_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v2 ; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[0:1], v8 @@ -360,34 +363,34 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5 ; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v6 -; 
GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 ; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v12, v8 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 ; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v13, v9, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v0 ; GCN-IR-NEXT: v_or_b32_e32 v4, v10, v4 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v6 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-IR-NEXT: v_or_b32_e32 v5, v11, v5 ; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v10 ; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v3 ; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v2 -; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10 ; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v11, v7 ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v11, v7 +; GCN-IR-NEXT: s_andn2_b64 s[4:5], exec, s[10:11] ; GCN-IR-NEXT: v_mov_b32_e32 v10, v6 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3 +; GCN-IR-NEXT: s_cselect_b64 exec, s[4:5], s[10:11] +; GCN-IR-NEXT: s_cbranch_scc1 .LBB1_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB1_5: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB1_5: ; %Flow4 ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 ; GCN-IR-NEXT: v_or_b32_e32 v4, v7, v1 ; GCN-IR-NEXT: v_or_b32_e32 v5, v6, v0 -; GCN-IR-NEXT: .LBB1_6: ; %Flow5 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: .LBB1_6: ; %udiv-end ; GCN-IR-NEXT: v_mov_b32_e32 v0, v5 ; GCN-IR-NEXT: v_mov_b32_e32 v1, v4 ; GCN-IR-NEXT: s_setpc_b64 s[30:31] @@ -1205,26 +1208,29 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] ; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v3, 0x8000 +; GCN-IR-NEXT: v_mov_b32_e32 
v2, 0x8000 ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v2, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB9_6 +; GCN-IR-NEXT: s_mov_b64 s[8:9], exec +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_scc0 .LBB9_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v4 -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v4 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc -; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v4 +; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[4:5], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 +; GCN-IR-NEXT: s_xor_b64 s[6:7], vcc, exec ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GCN-IR-NEXT: s_cbranch_execz .LBB9_5 +; GCN-IR-NEXT: s_and_b64 s[10:11], vcc, -1 +; GCN-IR-NEXT: s_cmov_b64 exec, vcc +; GCN-IR-NEXT: s_cbranch_scc0 .LBB9_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc @@ -1240,36 +1246,36 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 ; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v8 +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v9, vcc +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 ; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 
31, v4 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 +; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 ; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 ; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1 ; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0 -; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10 ; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 +; GCN-IR-NEXT: s_andn2_b64 s[4:5], exec, s[10:11] ; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB9_3 +; GCN-IR-NEXT: s_cselect_b64 exec, s[4:5], s[10:11] +; GCN-IR-NEXT: s_cbranch_scc1 .LBB9_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: .LBB9_5: ; %Flow4 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 -; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v1 -; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v0 -; GCN-IR-NEXT: .LBB9_6: ; %Flow5 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_mov_b32_e32 v0, v3 -; GCN-IR-NEXT: v_mov_b32_e32 v1, v2 +; GCN-IR-NEXT: v_or_b32_e32 v3, v5, v1 +; GCN-IR-NEXT: v_or_b32_e32 v2, v4, v0 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB9_6: ; %udiv-end +; GCN-IR-NEXT: v_mov_b32_e32 v0, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v1, v3 ; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = udiv i64 32768, %x ret i64 %result @@ -1294,25 +1300,28 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, s[4:5] ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[4:5] ; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-IR-NEXT: 
v_cmp_ne_u64_e32 vcc, 63, v[4:5] -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GCN-IR-NEXT: s_xor_b64 s[10:11], s[4:5], -1 +; GCN-IR-NEXT: s_and_b64 s[6:7], s[10:11], s[6:7] +; GCN-IR-NEXT: s_mov_b64 s[8:9], exec ; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v1, 0, s[4:5] +; GCN-IR-NEXT: s_and_b64 s[6:7], s[6:7], exec ; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5] -; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB10_6 +; GCN-IR-NEXT: s_cmov_b64 exec, s[6:7] +; GCN-IR-NEXT: s_cbranch_scc0 .LBB10_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v4 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v4 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v4 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[0:1], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 +; GCN-IR-NEXT: s_xor_b64 s[6:7], vcc, exec ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB10_5 +; GCN-IR-NEXT: s_and_b64 s[4:5], vcc, -1 +; GCN-IR-NEXT: s_cmov_b64 exec, vcc +; GCN-IR-NEXT: s_cbranch_scc0 .LBB10_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_lshr_b64 v[6:7], v[0:1], v6 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 0xffffffcf, v8 @@ -1337,23 +1346,23 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v8 ; GCN-IR-NEXT: v_and_b32_e32 v8, 0x8000, v8 ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3 ; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], v6, v8 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 +; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3 ; GCN-IR-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v7, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 +; GCN-IR-NEXT: s_andn2_b64 s[4:5], exec, s[10:11] ; GCN-IR-NEXT: 
v_mov_b32_e32 v8, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB10_3 +; GCN-IR-NEXT: s_cselect_b64 exec, s[4:5], s[10:11] +; GCN-IR-NEXT: s_cbranch_scc1 .LBB10_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: .LBB10_5: ; %Flow4 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v1 ; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v0 -; GCN-IR-NEXT: .LBB10_6: ; %Flow5 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB10_6: ; %udiv-end ; GCN-IR-NEXT: v_mov_b32_e32 v0, v3 ; GCN-IR-NEXT: v_mov_b32_e32 v1, v2 ; GCN-IR-NEXT: s_setpc_b64 s[30:31] @@ -1494,25 +1503,28 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, s[4:5] ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[4:5] ; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5] -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GCN-IR-NEXT: s_xor_b64 s[10:11], s[4:5], -1 +; GCN-IR-NEXT: s_and_b64 s[6:7], s[10:11], s[6:7] +; GCN-IR-NEXT: s_mov_b64 s[8:9], exec ; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v1, 0, s[4:5] +; GCN-IR-NEXT: s_and_b64 s[6:7], s[6:7], exec ; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5] -; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB12_6 +; GCN-IR-NEXT: s_cmov_b64 exec, s[6:7] +; GCN-IR-NEXT: s_cbranch_scc0 .LBB12_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v4 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v4 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v4 ; GCN-IR-NEXT: 
v_lshl_b64 v[2:3], v[0:1], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 +; GCN-IR-NEXT: s_xor_b64 s[6:7], vcc, exec ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB12_5 +; GCN-IR-NEXT: s_and_b64 s[4:5], vcc, -1 +; GCN-IR-NEXT: s_cmov_b64 exec, vcc +; GCN-IR-NEXT: s_cbranch_scc0 .LBB12_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_lshr_b64 v[6:7], v[0:1], v6 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc4, v8 @@ -1536,23 +1548,23 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v8 ; GCN-IR-NEXT: v_and_b32_e32 v8, 24, v8 ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3 ; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], v6, v8 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 +; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3 ; GCN-IR-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v7, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 +; GCN-IR-NEXT: s_andn2_b64 s[4:5], exec, s[10:11] ; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB12_3 +; GCN-IR-NEXT: s_cselect_b64 exec, s[4:5], s[10:11] +; GCN-IR-NEXT: s_cbranch_scc1 .LBB12_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: .LBB12_5: ; %Flow4 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v1 ; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v0 -; GCN-IR-NEXT: .LBB12_6: ; %Flow5 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB12_6: ; %udiv-end ; GCN-IR-NEXT: v_mov_b32_e32 v0, v3 ; GCN-IR-NEXT: v_mov_b32_e32 v1, v2 ; GCN-IR-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll 
index a3fc6ded0a0047..561285b97626dc 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll @@ -598,8 +598,9 @@ define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) %out, i32 % ; SI-LABEL: uniform_inside_divergent: ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 -; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc -; SI-NEXT: s_cbranch_execz .LBB11_2 +; SI-NEXT: s_and_b64 s[0:1], vcc, -1 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB11_2 ; SI-NEXT: ; %bb.1: ; %if ; SI-NEXT: s_load_dword s4, s[2:3], 0xb ; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 @@ -621,8 +622,9 @@ define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) %out, i32 % ; VI-LABEL: uniform_inside_divergent: ; VI: ; %bb.0: ; %entry ; VI-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 -; VI-NEXT: s_and_saveexec_b64 s[0:1], vcc -; VI-NEXT: s_cbranch_execz .LBB11_2 +; VI-NEXT: s_cmp_lg_u64 vcc, 0 +; VI-NEXT: s_cmov_b64 exec, vcc +; VI-NEXT: s_cbranch_scc0 .LBB11_2 ; VI-NEXT: ; %bb.1: ; %if ; VI-NEXT: s_load_dword s4, s[2:3], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 @@ -668,14 +670,15 @@ define amdgpu_kernel void @divergent_inside_uniform(ptr addrspace(1) %out, i32 % ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB12_2: ; %if ; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_cbranch_execz .LBB12_1 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB12_1 ; SI-NEXT: ; %bb.3: ; %if_uniform ; SI-NEXT: v_mov_b32_e32 v0, 1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -691,14 +694,15 @@ define amdgpu_kernel void @divergent_inside_uniform(ptr addrspace(1) %out, i32 % ; 
VI-NEXT: s_endpgm ; VI-NEXT: .LBB12_2: ; %if ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0 -; VI-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 +; VI-NEXT: s_cmp_lg_u64 vcc, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_cbranch_execz .LBB12_1 +; VI-NEXT: s_cmov_b64 exec, vcc +; VI-NEXT: s_cbranch_scc0 .LBB12_1 ; VI-NEXT: ; %bb.3: ; %if_uniform ; VI-NEXT: v_mov_b32_e32 v0, 1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -726,16 +730,18 @@ define amdgpu_kernel void @divergent_if_uniform_if(ptr addrspace(1) %out, i32 %c ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc -; SI-NEXT: s_cbranch_execz .LBB13_2 +; SI-NEXT: s_mov_b64 s[0:1], exec +; SI-NEXT: s_and_b64 s[6:7], vcc, -1 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB13_2 ; SI-NEXT: ; %bb.1: ; %if ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-NEXT: .LBB13_2: ; %endif ; SI-NEXT: s_or_b64 exec, exec, s[0:1] +; SI-NEXT: .LBB13_2: ; %endif ; SI-NEXT: s_load_dword s0, s[2:3], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s0, 0 @@ -754,16 +760,18 @@ define amdgpu_kernel void @divergent_if_uniform_if(ptr addrspace(1) %out, i32 %c ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: s_and_saveexec_b64 s[0:1], vcc -; VI-NEXT: s_cbranch_execz .LBB13_2 +; VI-NEXT: s_mov_b64 s[0:1], exec +; VI-NEXT: s_cmp_lg_u64 vcc, 0 +; VI-NEXT: s_cmov_b64 exec, vcc +; VI-NEXT: s_cbranch_scc0 .LBB13_2 ; VI-NEXT: ; %bb.1: ; %if ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; 
VI-NEXT: v_mov_b32_e32 v0, 1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; VI-NEXT: .LBB13_2: ; %endif ; VI-NEXT: s_or_b64 exec, exec, s[0:1] +; VI-NEXT: .LBB13_2: ; %endif ; VI-NEXT: s_load_dword s0, s[2:3], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/uniform-phi-with-undef.ll b/llvm/test/CodeGen/AMDGPU/uniform-phi-with-undef.ll index 64d4a0cf785013..98036b9e130706 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-phi-with-undef.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-phi-with-undef.ll @@ -13,10 +13,11 @@ define amdgpu_ps float @uniform_phi_with_undef(float inreg %c, float %v, i32 %x, ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: v_cmp_lt_i32_e64 s2, v2, v1 ; GCN-NEXT: s_mov_b32 s1, exec_lo -; GCN-NEXT: s_and_b32 s2, s1, s2 -; GCN-NEXT: s_mov_b32 exec_lo, s2 -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: s_cmp_lg_u32 s2, 0 +; GCN-NEXT: s_cmov_b32 exec_lo, s2 +; GCN-NEXT: s_cbranch_scc1 .LBB0_1 +; GCN-NEXT: s_branch .LBB0_2 +; GCN-NEXT: .LBB0_1: ; %if ; GCN-NEXT: s_mov_b32 s2, 0x40400000 ; GCN-NEXT: v_div_scale_f32 v1, s3, s2, s2, v0 ; GCN-NEXT: v_rcp_f32_e64 v2, v1 @@ -30,8 +31,8 @@ define amdgpu_ps float @uniform_phi_with_undef(float inreg %c, float %v, i32 %x, ; GCN-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GCN-NEXT: v_div_fmas_f32 v1, v1, v2, v4 ; GCN-NEXT: v_div_fixup_f32 v0, v1, s2, v0 -; GCN-NEXT: .LBB0_2: ; %end ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GCN-NEXT: .LBB0_2: ; %end ; GCN-NEXT: v_add_f32_e64 v0, v0, s0 ; GCN-NEXT: ; return to shader part epilog entry: diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll index a5e1506114f2d0..2c5c29a4c5cad1 100644 --- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll +++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll @@ -75,16 +75,17 @@ define hidden void @widget() { ; GCN-NEXT: s_and_b64 
s[20:21], vcc, exec ; GCN-NEXT: s_or_b64 s[46:47], s[18:19], s[20:21] ; GCN-NEXT: .LBB0_4: ; %Flow2 -; GCN-NEXT: s_and_saveexec_b64 s[18:19], s[46:47] -; GCN-NEXT: s_xor_b64 s[18:19], exec, s[18:19] -; GCN-NEXT: s_cbranch_execz .LBB0_6 +; GCN-NEXT: s_and_b64 s[20:21], s[46:47], exec +; GCN-NEXT: s_mov_b64 s[18:19], exec +; GCN-NEXT: s_cmov_b64 exec, s[20:21] +; GCN-NEXT: s_cbranch_scc0 .LBB0_6 ; GCN-NEXT: ; %bb.5: ; %bb12 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: flat_store_dword v[0:1], v2 -; GCN-NEXT: .LBB0_6: ; %Flow3 ; GCN-NEXT: s_or_b64 exec, exec, s[18:19] +; GCN-NEXT: .LBB0_6: ; %Flow3 ; GCN-NEXT: s_andn2_b64 vcc, exec, s[16:17] ; GCN-NEXT: s_cbranch_vccnz .LBB0_8 ; GCN-NEXT: ; %bb.7: ; %bb7 @@ -315,27 +316,30 @@ define hidden void @blam() { ; GCN-NEXT: s_branch .LBB1_2 ; GCN-NEXT: .LBB1_1: ; %Flow7 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-NEXT: s_and_b64 s[4:5], exec, s[4:5] ; GCN-NEXT: s_or_b64 s[50:51], s[4:5], s[50:51] -; GCN-NEXT: s_andn2_b64 exec, exec, s[50:51] -; GCN-NEXT: s_cbranch_execz .LBB1_18 +; GCN-NEXT: s_andn2_b64 s[4:5], exec, s[50:51] +; GCN-NEXT: s_cselect_b64 exec, s[4:5], s[50:51] +; GCN-NEXT: s_cbranch_scc0 .LBB1_18 ; GCN-NEXT: .LBB1_2: ; %bb2 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: flat_load_dword v0, v[41:42] ; GCN-NEXT: buffer_store_dword v42, off, s[0:3], 0 -; GCN-NEXT: s_mov_b64 s[6:7], 0 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 2, v0 +; GCN-NEXT: s_mov_b64 s[6:7], 0 +; GCN-NEXT: s_xor_b64 s[54:55], vcc, exec +; GCN-NEXT: s_and_b64 s[4:5], vcc, -1 ; GCN-NEXT: s_mov_b64 s[4:5], -1 -; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GCN-NEXT: s_xor_b64 s[54:55], exec, s[8:9] -; GCN-NEXT: s_cbranch_execz .LBB1_12 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB1_12 ; GCN-NEXT: ; %bb.3: ; %bb6 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 +; GCN-NEXT: s_mov_b64 
s[56:57], exec ; GCN-NEXT: v_cmp_eq_u32_e64 s[44:45], 3, v0 -; GCN-NEXT: s_and_saveexec_b64 s[56:57], s[44:45] -; GCN-NEXT: s_cbranch_execz .LBB1_11 +; GCN-NEXT: s_and_b64 s[4:5], s[44:45], -1 +; GCN-NEXT: s_cmov_b64 exec, s[44:45] +; GCN-NEXT: s_cbranch_scc0 .LBB1_11 ; GCN-NEXT: ; %bb.4: ; %bb11 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GCN-NEXT: s_getpc_b64 s[16:17] @@ -352,81 +356,93 @@ define hidden void @blam() { ; GCN-NEXT: v_mov_b32_e32 v31, v40 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 +; GCN-NEXT: s_xor_b64 s[4:5], vcc, exec +; GCN-NEXT: s_and_b64 s[6:7], vcc, -1 ; GCN-NEXT: s_mov_b64 s[6:7], 0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB1_10 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB1_10 ; GCN-NEXT: ; %bb.5: ; %bb14 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 +; GCN-NEXT: s_mov_b64 s[6:7], exec +; GCN-NEXT: s_and_b64 s[10:11], s[42:43], exec ; GCN-NEXT: s_mov_b64 s[8:9], s[52:53] -; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[42:43] -; GCN-NEXT: s_cbranch_execz .LBB1_7 +; GCN-NEXT: s_cmov_b64 exec, s[10:11] +; GCN-NEXT: s_cbranch_scc0 .LBB1_7 ; GCN-NEXT: ; %bb.6: ; %bb16 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0 ; GCN-NEXT: s_or_b64 s[8:9], s[52:53], exec +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: .LBB1_7: ; %Flow3 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-NEXT: s_mov_b64 s[6:7], 0 -; GCN-NEXT: s_and_saveexec_b64 s[10:11], s[8:9] -; GCN-NEXT: s_xor_b64 s[8:9], exec, s[10:11] -; GCN-NEXT: s_cbranch_execz .LBB1_9 +; GCN-NEXT: s_mov_b64 s[6:7], exec +; GCN-NEXT: s_and_b64 s[10:11], s[8:9], exec +; GCN-NEXT: s_mov_b64 s[8:9], 0 +; GCN-NEXT: s_cmov_b64 exec, s[10:11] +; GCN-NEXT: s_cbranch_scc0 .LBB1_9 ; GCN-NEXT: ; %bb.8: ; %bb17 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: s_mov_b64 s[6:7], exec +; GCN-NEXT: s_mov_b64 s[8:9], exec ; 
GCN-NEXT: buffer_store_dword v43, off, s[0:3], 0 +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: .LBB1_9: ; %Flow4 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN-NEXT: s_and_b64 s[6:7], s[8:9], exec +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: .LBB1_10: ; %Flow2 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: s_andn2_b64 s[4:5], s[44:45], exec ; GCN-NEXT: s_and_b64 s[8:9], vcc, exec ; GCN-NEXT: s_or_b64 s[44:45], s[4:5], s[8:9] ; GCN-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN-NEXT: s_or_b64 exec, exec, s[56:57] ; GCN-NEXT: .LBB1_11: ; %Flow1 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: s_or_b64 exec, exec, s[56:57] ; GCN-NEXT: s_orn2_b64 s[4:5], s[44:45], exec ; GCN-NEXT: s_and_b64 s[6:7], s[6:7], exec ; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: s_or_b64 exec, exec, s[54:55] ; GCN-NEXT: .LBB1_12: ; %Flow ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: s_andn2_saveexec_b64 s[8:9], s[54:55] -; GCN-NEXT: s_cbranch_execz .LBB1_16 +; GCN-NEXT: s_xor_b64 s[8:9], s[54:55], exec +; GCN-NEXT: s_and_b64 s[10:11], s[54:55], -1 +; GCN-NEXT: s_cmov_b64 exec, s[54:55] +; GCN-NEXT: s_cbranch_scc0 .LBB1_16 ; GCN-NEXT: ; %bb.13: ; %bb8 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 +; GCN-NEXT: s_mov_b64 s[12:13], exec ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: s_and_b64 s[10:11], vcc, -1 ; GCN-NEXT: s_mov_b64 s[10:11], s[6:7] -; GCN-NEXT: s_and_saveexec_b64 s[12:13], vcc -; GCN-NEXT: s_cbranch_execz .LBB1_15 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB1_15 ; GCN-NEXT: ; %bb.14: ; %bb10 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0 ; GCN-NEXT: s_or_b64 s[10:11], s[6:7], exec +; GCN-NEXT: s_or_b64 exec, exec, s[12:13] ; GCN-NEXT: .LBB1_15: ; %Flow6 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: s_or_b64 exec, exec, 
s[12:13] ; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec ; GCN-NEXT: s_and_b64 s[12:13], vcc, exec ; GCN-NEXT: s_andn2_b64 s[6:7], s[6:7], exec ; GCN-NEXT: s_and_b64 s[10:11], s[10:11], exec ; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[12:13] ; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] +; GCN-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-NEXT: .LBB1_16: ; %Flow5 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-NEXT: s_and_saveexec_b64 s[8:9], s[6:7] -; GCN-NEXT: s_cbranch_execz .LBB1_1 +; GCN-NEXT: s_and_b64 s[8:9], s[6:7], exec +; GCN-NEXT: s_mov_b64 s[6:7], exec +; GCN-NEXT: s_cmov_b64 exec, s[8:9] +; GCN-NEXT: s_cbranch_scc0 .LBB1_1 ; GCN-NEXT: ; %bb.17: ; %bb18 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0 ; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: s_branch .LBB1_1 ; GCN-NEXT: .LBB1_18: ; %DummyReturnBlock -; GCN-NEXT: s_or_b64 exec, exec, s[50:51] ; GCN-NEXT: v_readlane_b32 s57, v45, 25 ; GCN-NEXT: v_readlane_b32 s56, v45, 24 ; GCN-NEXT: v_readlane_b32 s55, v45, 23 diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index 3f346db3f3e665..03d1c64f335031 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -319,39 +319,42 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) { ; GCN-IR-LABEL: v_test_urem_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v2 -; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4 +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[4:5] +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v3 ; GCN-IR-NEXT: v_min_u32_e32 v12, v4, v5 ; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v0 -; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4 +; GCN-IR-NEXT: v_add_i32_e32 
v4, vcc, 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1 ; GCN-IR-NEXT: v_min_u32_e32 v13, v4, v5 -; GCN-IR-NEXT: v_sub_i32_e64 v4, s[6:7], v12, v13 -; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_subb_u32_e64 v5, s[6:7], 0, 0, s[6:7] -; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[4:5] -; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5] -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; GCN-IR-NEXT: v_cndmask_b32_e64 v7, v1, 0, s[4:5] -; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v0, 0, s[4:5] -; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB1_6 +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v13 +; GCN-IR-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[4:5], 63, v[4:5] +; GCN-IR-NEXT: s_or_b64 s[8:9], s[8:9], vcc +; GCN-IR-NEXT: s_xor_b64 s[10:11], s[8:9], -1 +; GCN-IR-NEXT: s_and_b64 s[4:5], s[10:11], s[4:5] +; GCN-IR-NEXT: s_mov_b64 s[6:7], exec +; GCN-IR-NEXT: v_cndmask_b32_e64 v7, v1, 0, s[8:9] +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v0, 0, s[8:9] +; GCN-IR-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_scc0 .LBB1_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 ; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v4 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v4 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4 ; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: s_xor_b64 s[8:9], vcc, exec ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB1_5 +; GCN-IR-NEXT: s_and_b64 s[4:5], vcc, 
-1 +; GCN-IR-NEXT: s_cmov_b64 exec, vcc +; GCN-IR-NEXT: s_cbranch_scc0 .LBB1_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v3, vcc @@ -369,34 +372,34 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5 ; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6 -; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 ; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v14, v10 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 ; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v15, v11, vcc +; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v8 ; GCN-IR-NEXT: v_or_b32_e32 v4, v12, v4 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v8 +; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; GCN-IR-NEXT: v_or_b32_e32 v5, v13, v5 ; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12 ; GCN-IR-NEXT: v_and_b32_e32 v13, v12, v3 ; GCN-IR-NEXT: v_and_b32_e32 v12, v12, v2 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] ; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v12 ; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v13, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v13, v7 ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v13, v7 +; GCN-IR-NEXT: s_andn2_b64 s[4:5], exec, s[10:11] ; GCN-IR-NEXT: v_mov_b32_e32 v12, v6 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3 +; GCN-IR-NEXT: s_cselect_b64 exec, s[4:5], s[10:11] +; GCN-IR-NEXT: s_cbranch_scc1 .LBB1_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB1_5: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB1_5: ; %Flow4 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 ; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v5 ; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4 -; GCN-IR-NEXT: .LBB1_6: ; %Flow5 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; 
GCN-IR-NEXT: .LBB1_6: ; %udiv-end ; GCN-IR-NEXT: v_mul_lo_u32 v4, v2, v7 ; GCN-IR-NEXT: v_mul_hi_u32 v5, v2, v6 ; GCN-IR-NEXT: v_mul_lo_u32 v3, v3, v6 @@ -1191,22 +1194,25 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB8_6 +; GCN-IR-NEXT: s_mov_b64 s[8:9], exec +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_scc0 .LBB8_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2 -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 +; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[4:5], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 +; GCN-IR-NEXT: s_xor_b64 s[6:7], vcc, exec ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GCN-IR-NEXT: s_cbranch_execz .LBB8_5 +; GCN-IR-NEXT: s_and_b64 s[10:11], vcc, -1 +; GCN-IR-NEXT: s_cmov_b64 exec, vcc +; GCN-IR-NEXT: s_cbranch_scc0 .LBB8_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc @@ -1222,34 +1228,34 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 ; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v8 +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: 
v_subb_u32_e32 v4, vcc, v13, v9, vcc +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 ; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 +; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 ; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 ; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1 ; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0 -; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10 ; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 +; GCN-IR-NEXT: s_andn2_b64 s[4:5], exec, s[10:11] ; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB8_3 +; GCN-IR-NEXT: s_cselect_b64 exec, s[4:5], s[10:11] +; GCN-IR-NEXT: s_cbranch_scc1 .LBB8_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: .LBB8_5: ; %Flow4 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3 ; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2 -; GCN-IR-NEXT: .LBB8_6: ; %Flow5 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB8_6: ; %udiv-end ; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v5 ; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v4 ; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v4 @@ -1282,25 +1288,28 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, s[4:5] ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[2:3] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[2:3] ; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3] -; 
GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GCN-IR-NEXT: s_xor_b64 s[10:11], s[4:5], -1 +; GCN-IR-NEXT: s_and_b64 s[6:7], s[10:11], s[6:7] +; GCN-IR-NEXT: s_mov_b64 s[8:9], exec ; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v1, 0, s[4:5] +; GCN-IR-NEXT: s_and_b64 s[6:7], s[6:7], exec ; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v0, 0, s[4:5] -; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB9_6 +; GCN-IR-NEXT: s_cmov_b64 exec, s[6:7] +; GCN-IR-NEXT: s_cbranch_scc0 .LBB9_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[0:1], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 +; GCN-IR-NEXT: s_xor_b64 s[6:7], vcc, exec ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB9_5 +; GCN-IR-NEXT: s_and_b64 s[4:5], vcc, -1 +; GCN-IR-NEXT: s_cmov_b64 exec, vcc +; GCN-IR-NEXT: s_cbranch_scc0 .LBB9_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[0:1], v6 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 0xffffffcf, v10 @@ -1325,23 +1334,23 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 ; GCN-IR-NEXT: v_and_b32_e32 v10, 0x8000, v10 ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] -; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10 -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 +; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 ; GCN-IR-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v9, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 +; GCN-IR-NEXT: s_andn2_b64 s[4:5], exec, s[10:11] ; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 -; GCN-IR-NEXT: 
s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB9_3 +; GCN-IR-NEXT: s_cselect_b64 exec, s[4:5], s[10:11] +; GCN-IR-NEXT: s_cbranch_scc1 .LBB9_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: .LBB9_5: ; %Flow4 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3 ; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2 -; GCN-IR-NEXT: .LBB9_6: ; %Flow5 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB9_6: ; %udiv-end ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[4:5], 15 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc diff --git a/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll b/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll index 1f36f7a0d9616e..4813109278276b 100644 --- a/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll @@ -19,11 +19,11 @@ define half @swap(half %a, half %b, i32 %i) { ; GFX11-TRUE16-NEXT: v_swap_b16 v0.l, v0.h ; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB0_1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-TRUE16-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %ret -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: swap: @@ -32,16 +32,16 @@ define half @swap(half %a, half %b, i32 %i) { ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: .LBB0_1: ; %loop ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_add_nc_u32 v2, -1, v2 ; GFX11-FAKE16-NEXT: v_swap_b32 v1, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_b32 s1, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB0_1 +; GFX11-FAKE16-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-FAKE16-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %ret -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -63,11 +63,11 @@ define half @swap(half %a, half %b, i32 %i) { ; GFX12-TRUE16-NEXT: v_swap_b16 v0.l, v0.h ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB0_1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-TRUE16-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-TRUE16-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %ret -; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: swap: @@ -80,16 +80,16 @@ define half @swap(half %a, half %b, i32 %i) { ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: .LBB0_1: ; %loop ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: 
v_dual_add_nc_u32 v2, -1, v2 ; GFX12-FAKE16-NEXT: v_swap_b32 v1, v0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_and_not1_b32 s1, exec_lo, s0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB0_1 +; GFX12-FAKE16-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-FAKE16-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %ret -; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll index 0211c5111c31dd..46ae1dde12dc82 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll @@ -26,13 +26,13 @@ define void @vgpr_descriptor_waterfall_loop_idom_update(ptr %arg) #0 { ; GCN-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[4:5] ; GCN-NEXT: s_and_b32 s4, vcc_lo, s4 ; GCN-NEXT: s_and_saveexec_b32 s4, s4 +; GCN-NEXT: s_xor_b32 s4, exec_lo, s4 ; GCN-NEXT: buffer_store_dword v0, v0, s[8:11], 0 offen ; GCN-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GCN-NEXT: s_waitcnt_depctr 0xffe3 -; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GCN-NEXT: s_cbranch_execnz .LBB0_2 +; GCN-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GCN-NEXT: s_cbranch_scc1 .LBB0_2 ; GCN-NEXT: ; %bb.3: ; in Loop: Header=BB0_1 Depth=1 -; GCN-NEXT: s_mov_b32 exec_lo, s5 ; GCN-NEXT: s_mov_b32 vcc_lo, exec_lo ; GCN-NEXT: s_cbranch_vccnz .LBB0_1 ; GCN-NEXT: ; %bb.4: ; %DummyReturnBlock @@ -60,13 +60,13 @@ define void @vgpr_descriptor_waterfall_loop_idom_update(ptr %arg) #0 { ; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: buffer_store_b32 v0, v0, s[4:7], 0 offen ; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB0_2 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX11-NEXT: ; %bb.3: ; in Loop: Header=BB0_1 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_mov_b32 vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_vccnz .LBB0_1 ; GFX11-NEXT: ; %bb.4: ; %DummyReturnBlock diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll index 2a280bcda42f52..6ede19864891c1 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll @@ -26,17 +26,18 @@ define amdgpu_ps float @else1(i32 %z, float %v) #0 { ; SI-NEXT: successors: %bb.4(0x80000000) ; SI-NEXT: {{ $}} ; SI-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[PHI1]], 0, [[PHI1]], 0, 0, implicit $mode, implicit $exec + ; SI-NEXT: SI_WAVE_RECONVERGE killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.4 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.3.else: ; SI-NEXT: successors: %bb.1(0x80000000) ; SI-NEXT: {{ $}} ; SI-NEXT: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, 1077936128, 0, killed [[COPY]], 0, 0, implicit $mode, implicit $exec + ; SI-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.1 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.4.end: ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[PHI]], %bb.1, [[V_ADD_F32_e64_]], %bb.2 - ; SI-NEXT: SI_END_CF killed [[SI_ELSE]], implicit-def 
dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: $vgpr0 = COPY killed [[PHI2]] ; SI-NEXT: SI_RETURN_TO_EPILOG killed $vgpr0 main_body: @@ -82,18 +83,19 @@ define amdgpu_ps float @else2(i32 %z, float %v) #0 { ; SI-NEXT: successors: %bb.4(0x80000000) ; SI-NEXT: {{ $}} ; SI-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; SI-NEXT: SI_WAVE_RECONVERGE killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.4 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.3.else: ; SI-NEXT: successors: %bb.1(0x80000000) ; SI-NEXT: {{ $}} ; SI-NEXT: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, 1077936128, 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; SI-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.1 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.4.end: ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, [[V_ADD_F32_e64_]], %bb.2 ; SI-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[PHI]], %bb.1, [[V_ADD_F32_e64_]], %bb.2 - ; SI-NEXT: SI_END_CF killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[PHI2]], 0, killed [[PHI3]], 0, 0, implicit $mode, implicit $exec ; SI-NEXT: $vgpr0 = COPY killed [[V_ADD_F32_e64_1]] ; SI-NEXT: SI_RETURN_TO_EPILOG killed $vgpr0 @@ -152,6 +154,7 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 { ; SI-NEXT: {{ $}} ; SI-NEXT: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[PHI]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec ; SI-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 1, killed [[PHI4]], 0, implicit $exec + ; SI-NEXT: SI_WAVE_RECONVERGE killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.5 ; 
SI-NEXT: {{ $}} ; SI-NEXT: bb.4.else: @@ -159,6 +162,7 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 { ; SI-NEXT: {{ $}} ; SI-NEXT: [[V_MUL_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[COPY2]], 0, [[PHI1]], 0, 0, implicit $mode, implicit $exec ; SI-NEXT: [[V_LSHL_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_ADD_U32_e64 killed [[PHI1]], 1, [[PHI1]], implicit $exec + ; SI-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.2 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.5.if.end: @@ -166,7 +170,6 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 { ; SI-NEXT: {{ $}} ; SI-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[PHI3]], %bb.2, [[V_MUL_F32_e64_]], %bb.3 ; SI-NEXT: [[PHI6:%[0-9]+]]:vgpr_32 = PHI [[PHI2]], %bb.2, [[V_ADD_U32_e64_]], %bb.3 - ; SI-NEXT: SI_END_CF killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 1, [[PHI6]], 0, implicit $exec ; SI-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 killed [[PHI]], 1, implicit-def dead $scc ; SI-NEXT: S_CMP_LT_I32 [[S_ADD_I32_]], [[COPY1]], implicit-def $scc @@ -233,10 +236,10 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun ; SI-NEXT: bb.1.Flow: ; SI-NEXT: successors: %bb.2(0x40000000), %bb.10(0x40000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %47:vgpr_32, %bb.0, %4, %bb.9 - ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY4]], %bb.0, undef %49:vgpr_32, %bb.9 - ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %51:vgpr_32, %bb.9 - ; SI-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %53:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %49:vgpr_32, %bb.0, %4, %bb.9 + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY4]], %bb.0, undef %51:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI 
[[COPY3]], %bb.0, undef %53:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %55:vgpr_32, %bb.9 ; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.2 ; SI-NEXT: {{ $}} @@ -250,8 +253,8 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun ; SI-NEXT: bb.3: ; SI-NEXT: successors: %bb.4(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %55:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 - ; SI-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI undef %57:vgpr_32, %bb.4, [[PHI1]], %bb.2 + ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %57:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 + ; SI-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI undef %59:vgpr_32, %bb.4, [[PHI1]], %bb.2 ; SI-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -267,14 +270,14 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun ; SI-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE1]], 0, csr_amdgpu_si_gfx, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0 ; SI-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; SI-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 - ; SI-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_]], implicit-def dead $scc - ; SI-NEXT: SI_WATERFALL_LOOP %bb.3, implicit $exec + ; SI-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; SI-NEXT: SI_WATERFALL_LOOP killed [[S_XOR_B32_term]], 
[[S_MOV_B32_]], %bb.3, implicit killed $scc ; SI-NEXT: {{ $}} ; SI-NEXT: bb.5: ; SI-NEXT: successors: %bb.10(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: $exec_lo = S_MOV_B32 killed [[S_MOV_B32_]] ; SI-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY7]] + ; SI-NEXT: SI_WAVE_RECONVERGE killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.10 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.6.else: @@ -287,8 +290,8 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun ; SI-NEXT: bb.7: ; SI-NEXT: successors: %bb.8(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI6:%[0-9]+]]:vreg_64 = PHI undef %59:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 - ; SI-NEXT: [[PHI7:%[0-9]+]]:vgpr_32 = PHI undef %61:vgpr_32, %bb.8, [[COPY4]], %bb.6 + ; SI-NEXT: [[PHI6:%[0-9]+]]:vreg_64 = PHI undef %61:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 + ; SI-NEXT: [[PHI7:%[0-9]+]]:vgpr_32 = PHI undef %63:vgpr_32, %bb.8, [[COPY4]], %bb.6 ; SI-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI6]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI6]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_2]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub1 @@ -304,19 +307,18 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun ; SI-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE3]], 0, csr_amdgpu_si_gfx, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0 ; SI-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; SI-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 - ; SI-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_1]], implicit-def dead $scc - ; SI-NEXT: SI_WATERFALL_LOOP %bb.7, implicit $exec + ; SI-NEXT: [[S_XOR_B32_term1:%[0-9]+]]:sreg_32_xm0_xexec = 
S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_1]], implicit-def $scc + ; SI-NEXT: SI_WATERFALL_LOOP killed [[S_XOR_B32_term1]], [[S_MOV_B32_1]], %bb.7, implicit killed $scc ; SI-NEXT: {{ $}} ; SI-NEXT: bb.9: ; SI-NEXT: successors: %bb.1(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: $exec_lo = S_MOV_B32 killed [[S_MOV_B32_1]] ; SI-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY10]] + ; SI-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.1 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.10.end: ; SI-NEXT: [[PHI8:%[0-9]+]]:vgpr_32 = PHI [[PHI]], %bb.1, [[COPY8]], %bb.5 - ; SI-NEXT: SI_END_CF killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: $vgpr0 = COPY killed [[PHI8]] ; SI-NEXT: SI_RETURN_TO_EPILOG killed $vgpr0 main_body: @@ -356,9 +358,9 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e ; SI-NEXT: bb.1.Flow: ; SI-NEXT: successors: %bb.2(0x40000000), %bb.10(0x40000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %48:vgpr_32, %bb.0, %4, %bb.9 - ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %50:vgpr_32, %bb.9 - ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %52:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %50:vgpr_32, %bb.0, %4, %bb.9 + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %52:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %54:vgpr_32, %bb.9 ; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.2 ; SI-NEXT: {{ $}} @@ -372,7 +374,7 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e ; SI-NEXT: bb.3: ; SI-NEXT: successors: %bb.4(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI3:%[0-9]+]]:vreg_64 = PHI undef %54:vreg_64, %bb.4, 
[[REG_SEQUENCE]], %bb.2 + ; SI-NEXT: [[PHI3:%[0-9]+]]:vreg_64 = PHI undef %56:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 ; SI-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI3]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI3]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -388,14 +390,14 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e ; SI-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE1]], 0, csr_amdgpu_si_gfx, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0 ; SI-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; SI-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 - ; SI-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_]], implicit-def dead $scc - ; SI-NEXT: SI_WATERFALL_LOOP %bb.3, implicit $exec + ; SI-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; SI-NEXT: SI_WATERFALL_LOOP killed [[S_XOR_B32_term]], [[S_MOV_B32_]], %bb.3, implicit killed $scc ; SI-NEXT: {{ $}} ; SI-NEXT: bb.5: ; SI-NEXT: successors: %bb.10(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: $exec_lo = S_MOV_B32 killed [[S_MOV_B32_]] ; SI-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY7]] + ; SI-NEXT: SI_WAVE_RECONVERGE killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.10 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.6.else: @@ -408,7 +410,7 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e ; SI-NEXT: bb.7: ; SI-NEXT: successors: %bb.8(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %56:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 + ; SI-NEXT: 
[[PHI4:%[0-9]+]]:vreg_64 = PHI undef %58:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 ; SI-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_2]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub1 @@ -424,19 +426,18 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e ; SI-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE3]], 0, csr_amdgpu_si_gfx, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0 ; SI-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; SI-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 - ; SI-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_1]], implicit-def dead $scc - ; SI-NEXT: SI_WATERFALL_LOOP %bb.7, implicit $exec + ; SI-NEXT: [[S_XOR_B32_term1:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_1]], implicit-def $scc + ; SI-NEXT: SI_WATERFALL_LOOP killed [[S_XOR_B32_term1]], [[S_MOV_B32_1]], %bb.7, implicit killed $scc ; SI-NEXT: {{ $}} ; SI-NEXT: bb.9: ; SI-NEXT: successors: %bb.1(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: $exec_lo = S_MOV_B32 killed [[S_MOV_B32_1]] ; SI-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY10]] + ; SI-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.1 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.10.end: ; SI-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[PHI]], %bb.1, [[COPY8]], %bb.5 - ; SI-NEXT: SI_END_CF killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[PHI5]], 0, killed [[COPY4]], 0, 0, implicit $mode, implicit $exec 
; SI-NEXT: $vgpr0 = COPY killed [[V_ADD_F32_e64_]] ; SI-NEXT: SI_RETURN_TO_EPILOG killed $vgpr0 @@ -480,6 +481,7 @@ define amdgpu_kernel void @livevariables_update_missed_block(ptr addrspace(1) %s ; SI-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8) from %ir.i10, addrspace 1) ; SI-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec ; SI-NEXT: GLOBAL_STORE_BYTE killed [[V_MOV_B]], killed [[GLOBAL_LOAD_UBYTE]], 0, 0, implicit $exec :: (store (s8) into `ptr addrspace(1) null`, addrspace 1) + ; SI-NEXT: SI_WAVE_RECONVERGE killed %6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.7 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.2.if.then9: @@ -512,10 +514,10 @@ define amdgpu_kernel void @livevariables_update_missed_block(ptr addrspace(1) %s ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %41:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4 ; SI-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec ; SI-NEXT: GLOBAL_STORE_BYTE killed [[V_MOV_B2]], killed [[PHI1]], 0, 0, implicit $exec :: (store (s8) into `ptr addrspace(1) null`, addrspace 1) + ; SI-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.5 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.7.UnifiedReturnBlock: - ; SI-NEXT: SI_END_CF killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_ENDPGM 0 entry: %i2 = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -636,18 +638,16 @@ define protected amdgpu_kernel void @nested_waterfalls(ptr addrspace(1) %tex.coe ; SI-NEXT: successors: %bb.4(0x40000000), %bb.6(0x40000000) ; SI-NEXT: {{ $}} ; SI-NEXT: [[IMAGE_SAMPLE_V1_V2_gfx10_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_V1_V2_gfx10 undef %28:vreg_64, [[REG_SEQUENCE5]], killed [[REG_SEQUENCE8]], 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) - ; SI-NEXT: $exec_lo = 
S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_1]], implicit-def dead $scc - ; SI-NEXT: SI_WATERFALL_LOOP %bb.4, implicit $exec + ; SI-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_1]], implicit-def $scc + ; SI-NEXT: SI_WATERFALL_LOOP killed [[S_XOR_B32_term]], [[S_MOV_B32_1]], %bb.4, implicit killed $scc ; SI-NEXT: {{ $}} ; SI-NEXT: bb.6: ; SI-NEXT: successors: %bb.2(0x40000000), %bb.7(0x40000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: $exec_lo = S_MOV_B32 killed [[S_MOV_B32_1]] - ; SI-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_]], implicit-def dead $scc - ; SI-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; SI-NEXT: [[S_XOR_B32_term1:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; SI-NEXT: SI_WATERFALL_LOOP killed [[S_XOR_B32_term1]], [[S_MOV_B32_]], %bb.2, implicit killed $scc ; SI-NEXT: {{ $}} ; SI-NEXT: bb.7: - ; SI-NEXT: $exec_lo = S_MOV_B32 killed [[S_MOV_B32_]] ; SI-NEXT: GLOBAL_STORE_DWORD undef %31:vreg_64, killed [[IMAGE_SAMPLE_V1_V2_gfx10_]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) ; SI-NEXT: S_ENDPGM 0 entry: diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll index 25d8300eb45835..74badb13218f56 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll @@ -7,25 +7,24 @@ define amdgpu_ps float @else1(i32 %z, float %v) #0 { ; SI: ; %bb.0: ; %main_body ; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v0 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: s_and_saveexec_b32 s0, vcc_lo -; SI-NEXT: s_xor_b32 s0, exec_lo, s0 -; SI-NEXT: s_cbranch_execnz .LBB0_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b32 s0, s0 -; SI-NEXT: s_cbranch_execnz .LBB0_4 -; SI-NEXT: .LBB0_2: ; %end -; SI-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; SI-NEXT: s_branch .LBB0_5 -; SI-NEXT: .LBB0_3: ; %else +; 
SI-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; SI-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; SI-NEXT: s_cmov_b32 exec_lo, vcc_lo +; SI-NEXT: s_cbranch_scc0 .LBB0_2 +; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: v_mul_f32_e32 v0, 0x40400000, v1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: s_andn2_saveexec_b32 s0, s0 -; SI-NEXT: s_cbranch_execz .LBB0_2 -; SI-NEXT: .LBB0_4: ; %if -; SI-NEXT: v_add_f32_e32 v0, v1, v1 ; SI-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; SI-NEXT: s_branch .LBB0_5 -; SI-NEXT: .LBB0_5: +; SI-NEXT: .LBB0_2: ; %Flow +; SI-NEXT: s_xor_b32 s1, s0, exec_lo +; SI-NEXT: s_cmp_lg_u32 s0, 0 +; SI-NEXT: s_cmov_b32 exec_lo, s0 +; SI-NEXT: s_cbranch_scc0 .LBB0_4 +; SI-NEXT: ; %bb.3: ; %if +; SI-NEXT: v_add_f32_e32 v0, v1, v1 +; SI-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; SI-NEXT: .LBB0_4: ; %end +; SI-NEXT: ; return to shader part epilog main_body: %cc = icmp sgt i32 %z, 5 br i1 %cc, label %if, label %else @@ -50,17 +49,23 @@ define amdgpu_ps float @else2(i32 %z, float %v) #0 { ; SI: ; %bb.0: ; %main_body ; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v0 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: s_and_saveexec_b32 s0, vcc_lo -; SI-NEXT: s_xor_b32 s0, exec_lo, s0 +; SI-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; SI-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; SI-NEXT: s_cmov_b32 exec_lo, vcc_lo +; SI-NEXT: s_cbranch_scc0 .LBB1_2 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: v_mul_f32_e32 v0, 0x40400000, v1 -; SI-NEXT: ; %bb.2: ; %Flow -; SI-NEXT: s_andn2_saveexec_b32 s0, s0 +; SI-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; SI-NEXT: .LBB1_2: ; %Flow +; SI-NEXT: s_xor_b32 s1, s0, exec_lo +; SI-NEXT: s_cmp_lg_u32 s0, 0 +; SI-NEXT: s_cmov_b32 exec_lo, s0 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 ; SI-NEXT: ; %bb.3: ; %if ; SI-NEXT: v_add_f32_e32 v1, v1, v1 ; SI-NEXT: v_mov_b32_e32 v0, v1 -; SI-NEXT: ; %bb.4: ; %end -; SI-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; SI-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; SI-NEXT: .LBB1_4: ; %end ; SI-NEXT: v_add_f32_e32 v0, v1, v0 ; SI-NEXT: ; return to shader part epilog main_body: @@ -91,30 +96,36 
@@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 { ; SI-NEXT: s_branch .LBB2_2 ; SI-NEXT: .LBB2_1: ; %if.end ; SI-NEXT: ; in Loop: Header=BB2_2 Depth=1 -; SI-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; SI-NEXT: v_add_nc_u32_e32 v2, 1, v3 ; SI-NEXT: s_add_i32 s1, s1, 1 ; SI-NEXT: s_cmp_lt_i32 s1, s0 ; SI-NEXT: s_cbranch_scc0 .LBB2_6 ; SI-NEXT: .LBB2_2: ; %for.body ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_and_b32 s3, vcc_lo, exec_lo ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: s_and_saveexec_b32 s2, vcc_lo -; SI-NEXT: s_xor_b32 s2, exec_lo, s2 +; SI-NEXT: s_xor_b32 s2, s3, exec_lo +; SI-NEXT: s_cmp_lg_u32 s3, 0 +; SI-NEXT: s_cmov_b32 exec_lo, s3 +; SI-NEXT: s_cbranch_scc0 .LBB2_4 ; SI-NEXT: ; %bb.3: ; %else ; SI-NEXT: ; in Loop: Header=BB2_2 Depth=1 ; SI-NEXT: v_mul_f32_e32 v0, v1, v2 ; SI-NEXT: v_lshl_add_u32 v3, v2, 1, v2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; %bb.4: ; %Flow +; SI-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; SI-NEXT: .LBB2_4: ; %Flow ; SI-NEXT: ; in Loop: Header=BB2_2 Depth=1 -; SI-NEXT: s_andn2_saveexec_b32 s2, s2 -; SI-NEXT: s_cbranch_execz .LBB2_1 +; SI-NEXT: s_xor_b32 s3, s2, exec_lo +; SI-NEXT: s_cmp_lg_u32 s2, 0 +; SI-NEXT: s_cmov_b32 exec_lo, s2 +; SI-NEXT: s_cbranch_scc0 .LBB2_1 ; SI-NEXT: ; %bb.5: ; %if ; SI-NEXT: ; in Loop: Header=BB2_2 Depth=1 ; SI-NEXT: v_mul_f32_e32 v0, s1, v1 ; SI-NEXT: v_add_nc_u32_e32 v3, 1, v2 +; SI-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; SI-NEXT: s_branch .LBB2_1 ; SI-NEXT: .LBB2_6: ; %for.end ; SI-NEXT: v_add_f32_e32 v0, v3, v0 @@ -165,16 +176,17 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun ; SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; SI-NEXT: s_mov_b32 s14, -1 -; SI-NEXT: v_mov_b32_e32 v0, v1 -; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v6 ; SI-NEXT: s_mov_b32 s15, 0x31c16000 +; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v6 ; SI-NEXT: s_add_u32 s12, s12, 
s1 +; SI-NEXT: v_mov_b32_e32 v0, v1 ; SI-NEXT: s_addc_u32 s13, s13, 0 ; SI-NEXT: s_mov_b32 s32, 0 +; SI-NEXT: s_xor_b32 s6, vcc_lo, exec_lo +; SI-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: s_and_saveexec_b32 s0, vcc_lo -; SI-NEXT: s_xor_b32 s6, exec_lo, s0 -; SI-NEXT: s_cbranch_execz .LBB3_4 +; SI-NEXT: s_cmov_b32 exec_lo, vcc_lo +; SI-NEXT: s_cbranch_scc0 .LBB3_4 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_mov_b32 s7, exec_lo ; SI-NEXT: .LBB3_2: ; =>This Inner Loop Header: Depth=1 @@ -186,19 +198,22 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun ; SI-NEXT: s_mov_b64 s[2:3], s[14:15] ; SI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; SI-NEXT: v_mov_b32_e32 v1, v0 +; SI-NEXT: s_xor_b32 s0, exec_lo, s8 ; SI-NEXT: ; implicit-def: $vgpr4_vgpr5 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s8 -; SI-NEXT: s_cbranch_execnz .LBB3_2 +; SI-NEXT: s_cselect_b32 exec_lo, s0, s7 +; SI-NEXT: s_cbranch_scc1 .LBB3_2 ; SI-NEXT: ; %bb.3: -; SI-NEXT: s_mov_b32 exec_lo, s7 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; SI-NEXT: .LBB3_4: ; %Flow -; SI-NEXT: s_andn2_saveexec_b32 s6, s6 -; SI-NEXT: s_cbranch_execz .LBB3_8 +; SI-NEXT: s_xor_b32 s7, s6, exec_lo +; SI-NEXT: s_cmp_lg_u32 s6, 0 +; SI-NEXT: s_cmov_b32 exec_lo, s6 +; SI-NEXT: s_cbranch_scc0 .LBB3_8 ; SI-NEXT: ; %bb.5: ; %if -; SI-NEXT: s_mov_b32 s7, exec_lo +; SI-NEXT: s_mov_b32 s6, exec_lo ; SI-NEXT: .LBB3_6: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_readfirstlane_b32 s4, v2 ; SI-NEXT: v_readfirstlane_b32 s5, v3 @@ -208,14 +223,14 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun ; SI-NEXT: s_mov_b64 s[2:3], s[14:15] ; SI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; SI-NEXT: v_mov_b32_e32 v1, v0 +; SI-NEXT: s_xor_b32 s0, exec_lo, s8 ; SI-NEXT: ; implicit-def: $vgpr2_vgpr3 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: s_xor_b32 exec_lo, exec_lo, 
s8 -; SI-NEXT: s_cbranch_execnz .LBB3_6 +; SI-NEXT: s_cselect_b32 exec_lo, s0, s6 +; SI-NEXT: s_cbranch_scc1 .LBB3_6 ; SI-NEXT: ; %bb.7: -; SI-NEXT: s_mov_b32 exec_lo, s7 +; SI-NEXT: s_or_b32 exec_lo, exec_lo, s7 ; SI-NEXT: .LBB3_8: ; %end -; SI-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; SI-NEXT: v_mov_b32_e32 v0, v1 ; SI-NEXT: ; return to shader part epilog main_body: @@ -241,17 +256,18 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e ; SI: ; %bb.0: ; %main_body ; SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; SI-NEXT: s_mov_b32 s14, -1 -; SI-NEXT: v_mov_b32_e32 v40, v1 ; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v0 +; SI-NEXT: s_mov_b32 s14, -1 ; SI-NEXT: s_mov_b32 s15, 0x31c16000 ; SI-NEXT: s_add_u32 s12, s12, s1 +; SI-NEXT: v_mov_b32_e32 v40, v1 ; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_xor_b32 s6, vcc_lo, exec_lo +; SI-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; SI-NEXT: s_mov_b32 s32, 0 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: s_and_saveexec_b32 s0, vcc_lo -; SI-NEXT: s_xor_b32 s6, exec_lo, s0 -; SI-NEXT: s_cbranch_execz .LBB4_4 +; SI-NEXT: s_cmov_b32 exec_lo, vcc_lo +; SI-NEXT: s_cbranch_scc0 .LBB4_4 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_mov_b32 s7, exec_lo ; SI-NEXT: .LBB4_2: ; =>This Inner Loop Header: Depth=1 @@ -263,17 +279,20 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e ; SI-NEXT: s_mov_b64 s[0:1], s[12:13] ; SI-NEXT: s_mov_b64 s[2:3], s[14:15] ; SI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SI-NEXT: s_xor_b32 s0, exec_lo, s8 ; SI-NEXT: ; implicit-def: $vgpr4_vgpr5 -; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s8 -; SI-NEXT: s_cbranch_execnz .LBB4_2 +; SI-NEXT: s_cselect_b32 exec_lo, s0, s7 +; SI-NEXT: s_cbranch_scc1 .LBB4_2 ; SI-NEXT: ; %bb.3: -; SI-NEXT: s_mov_b32 exec_lo, s7 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; SI-NEXT: .LBB4_4: ; %Flow -; SI-NEXT: s_andn2_saveexec_b32 s6, s6 -; SI-NEXT: s_cbranch_execz .LBB4_8 
+; SI-NEXT: s_xor_b32 s7, s6, exec_lo +; SI-NEXT: s_cmp_lg_u32 s6, 0 +; SI-NEXT: s_cmov_b32 exec_lo, s6 +; SI-NEXT: s_cbranch_scc0 .LBB4_8 ; SI-NEXT: ; %bb.5: ; %if -; SI-NEXT: s_mov_b32 s7, exec_lo +; SI-NEXT: s_mov_b32 s6, exec_lo ; SI-NEXT: .LBB4_6: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_readfirstlane_b32 s4, v2 ; SI-NEXT: v_readfirstlane_b32 s5, v3 @@ -283,13 +302,13 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e ; SI-NEXT: s_mov_b64 s[0:1], s[12:13] ; SI-NEXT: s_mov_b64 s[2:3], s[14:15] ; SI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SI-NEXT: s_xor_b32 s0, exec_lo, s8 ; SI-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s8 -; SI-NEXT: s_cbranch_execnz .LBB4_6 +; SI-NEXT: s_cselect_b32 exec_lo, s0, s6 +; SI-NEXT: s_cbranch_scc1 .LBB4_6 ; SI-NEXT: ; %bb.7: -; SI-NEXT: s_mov_b32 exec_lo, s7 +; SI-NEXT: s_or_b32 exec_lo, exec_lo, s7 ; SI-NEXT: .LBB4_8: ; %end -; SI-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; SI-NEXT: v_add_f32_e32 v0, v0, v40 ; SI-NEXT: ; return to shader part epilog main_body: diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll b/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll index 4efa1e9353ab3a..23b650bb016d20 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll @@ -75,15 +75,17 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp ; CHECK: ; %bb.0: ; %.entry ; CHECK-NEXT: global_load_b32 v3, v[0:1], off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 -; CHECK-NEXT: s_mov_b32 s0, exec_lo ; CHECK-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; CHECK-NEXT: v_lshlrev_b64_e32 v[3:4], 2, v[3:4] ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo +; CHECK-NEXT: v_cmp_eq_u32_e32 
vcc_lo, 0, v2 ; CHECK-NEXT: global_load_b32 v3, v[0:1], off offset:336 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; CHECK-NEXT: scratch_store_b32 off, v3, off offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: global_load_b32 v3, v[0:1], off offset:448 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 @@ -94,9 +96,8 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp ; CHECK-NEXT: global_load_b32 v3, v[0:1], off offset:720 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: scratch_store_b32 off, v3, off offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: v_cmpx_eq_u32_e32 0, v2 -; CHECK-NEXT: s_xor_b32 s0, exec_lo, s0 -; CHECK-NEXT: s_cbranch_execz .LBB1_2 +; CHECK-NEXT: s_cmov_b32 exec_lo, vcc_lo +; CHECK-NEXT: s_cbranch_scc0 .LBB1_2 ; CHECK-NEXT: ; %bb.1: ; %.false ; CHECK-NEXT: global_load_b32 v10, v[0:1], off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 @@ -153,9 +154,13 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp ; CHECK-NEXT: ; implicit-def: $vgpr0 ; CHECK-NEXT: ; kill: killed $vgpr0 ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; CHECK-NEXT: .LBB1_2: ; %Flow -; CHECK-NEXT: s_and_not1_saveexec_b32 s0, s0 -; CHECK-NEXT: s_cbranch_execz .LBB1_4 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_xor_b32 s1, s0, exec_lo +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: s_cmov_b32 exec_lo, s0 +; CHECK-NEXT: s_cbranch_scc0 .LBB1_4 ; CHECK-NEXT: ; %bb.3: ; %.true ; CHECK-NEXT: global_load_b32 v10, v[0:1], off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 @@ -207,8 +212,8 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; CHECK-NEXT: .LBB1_4: 
; %.exit -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; CHECK-NEXT: scratch_load_b32 v0, off, off th:TH_LOAD_LU ; 4-byte Folded Reload ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: s_wait_storecnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll index 50927a2cf21afe..83a46905483b43 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll @@ -24,27 +24,28 @@ define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9() { ; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: ; implicit-def: $sgpr4 ; CHECK-NEXT: s_mov_b32 s4, 0 -; CHECK-NEXT: v_cmp_eq_u32_e64 s[6:7], v2, s4 +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v2, s4 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: ds_write_b8 v1, v2 -; CHECK-NEXT: s_mov_b64 s[4:5], exec -; CHECK-NEXT: v_writelane_b32 v0, s4, 0 -; CHECK-NEXT: v_writelane_b32 v0, s5, 1 +; CHECK-NEXT: s_mov_b64 s[6:7], exec +; CHECK-NEXT: v_writelane_b32 v0, s6, 0 +; CHECK-NEXT: v_writelane_b32 v0, s7, 1 ; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[8:9] -; CHECK-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; CHECK-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-NEXT: s_cbranch_execz .LBB0_2 -; CHECK-NEXT: ; %bb.1: ; %bb193 -; CHECK-NEXT: .LBB0_2: ; %bb194 +; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0 +; CHECK-NEXT: s_cmov_b64 exec, s[4:5] +; CHECK-NEXT: s_cbranch_scc1 .LBB0_1 +; CHECK-NEXT: s_branch .LBB0_2 +; CHECK-NEXT: .LBB0_1: ; %bb193 ; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[8:9] ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_readlane_b32 s4, v1, 0 -; CHECK-NEXT: 
v_readlane_b32 s5, v1, 1 +; CHECK-NEXT: v_readlane_b32 s4, v0, 0 +; CHECK-NEXT: v_readlane_b32 s5, v0, 1 ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: .LBB0_2: ; %bb194 ; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll index 33c06e5d1e3a5e..e3337bd2a4e4d0 100644 --- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll @@ -8,29 +8,31 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX906-NEXT: v_mov_b32_e32 v3, 8 -; GFX906-NEXT: v_mov_b32_e32 v1, 0 +; GFX906-NEXT: s_mov_b32 s2, 0xff0000 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dword v4, v2, s[4:5] -; GFX906-NEXT: s_mov_b32 s4, 0xff0000 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_mov_b64 s[8:9], exec +; GFX906-NEXT: v_mov_b32_e32 v1, 0 +; GFX906-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshrrev_b32_sdwa v5, v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX906-NEXT: v_and_or_b32 v4, v4, s4, v5 -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX906-NEXT: s_cbranch_execz .LBB0_2 +; GFX906-NEXT: v_and_or_b32 v0, v4, s2, v5 +; GFX906-NEXT: s_cmov_b64 exec, vcc +; GFX906-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dword v0, v2, s[6:7] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshrrev_b32_sdwa v2, v3, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX906-NEXT: v_and_or_b32 v4, v0, s4, v2 +; GFX906-NEXT: v_and_or_b32 v0, v0, s2, v2 +; GFX906-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX906-NEXT: .LBB0_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX906-NEXT: global_store_byte_d16_hi v1, v4, s[0:1] offset:2 -; GFX906-NEXT: global_store_short v1, v4, s[0:1] +; GFX906-NEXT: global_store_byte_d16_hi v1, v0, s[0:1] offset:2 +; GFX906-NEXT: global_store_short v1, v0, s[0:1] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -54,19 +56,21 @@ define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX906-NEXT: v_lshlrev_b32_e32 v3, 2, v0 -; GFX906-NEXT: v_mov_b32_e32 v1, 0 +; GFX906-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_mov_b64 s[8:9], exec ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dword v2, v3, s[4:5] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX906-NEXT: s_cbranch_execz .LBB1_2 +; GFX906-NEXT: global_load_dword v1, v2, s[4:5] +; GFX906-NEXT: v_mov_b32_e32 v0, 0 +; GFX906-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX906-NEXT: s_cmov_b64 exec, vcc +; GFX906-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dword v2, v3, s[6:7] +; GFX906-NEXT: global_load_dword v1, v2, s[6:7] +; GFX906-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX906-NEXT: .LBB1_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dword v1, v2, s[0:1] +; GFX906-NEXT: global_store_dword v0, v1, s[0:1] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -91,20 +95,22 @@ define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; GFX906-NEXT: v_mov_b32_e32 v3, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_mov_b64 s[8:9], exec ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[4:5] +; GFX906-NEXT: v_mov_b32_e32 v3, 0 +; GFX906-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX906-NEXT: s_cbranch_execz .LBB2_2 +; GFX906-NEXT: s_cmov_b64 exec, vcc +; GFX906-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[6:7] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX906-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX906-NEXT: .LBB2_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: global_store_byte v3, v2, s[0:1] offset:4 ; GFX906-NEXT: global_store_dword v3, v1, s[0:1] ; GFX906-NEXT: s_endpgm @@ -130,19 +136,21 @@ define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; GFX906-NEXT: v_mov_b32_e32 v3, 0 +; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_mov_b64 s[8:9], exec ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[4:5] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX906-NEXT: s_cbranch_execz .LBB3_2 +; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[4:5] +; GFX906-NEXT: v_mov_b32_e32 v0, 0 +; GFX906-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX906-NEXT: s_cmov_b64 exec, vcc +; GFX906-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[6:7] +; GFX906-NEXT: 
global_load_dwordx2 v[1:2], v3, s[6:7] +; GFX906-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX906-NEXT: .LBB3_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx2 v3, v[1:2], s[0:1] +; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -166,19 +174,21 @@ define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1 ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX906-NEXT: v_lshlrev_b32_e32 v6, 4, v0 -; GFX906-NEXT: v_mov_b32_e32 v5, 0 +; GFX906-NEXT: v_lshlrev_b32_e32 v5, 4, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_mov_b64 s[8:9], exec ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx4 v[1:4], v6, s[4:5] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX906-NEXT: s_cbranch_execz .LBB4_2 +; GFX906-NEXT: global_load_dwordx4 v[1:4], v5, s[4:5] +; GFX906-NEXT: v_mov_b32_e32 v0, 0 +; GFX906-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX906-NEXT: s_cmov_b64 exec, vcc +; GFX906-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx4 v[1:4], v6, s[6:7] +; GFX906-NEXT: global_load_dwordx4 v[1:4], v5, s[6:7] +; GFX906-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX906-NEXT: .LBB4_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx4 v5, v[1:4], s[0:1] +; GFX906-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -202,23 +212,25 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1 ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX906-NEXT: v_lshlrev_b32_e32 v10, 5, v0 -; GFX906-NEXT: v_mov_b32_e32 v9, 0 
+; GFX906-NEXT: v_lshlrev_b32_e32 v9, 5, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_mov_b64 s[8:9], exec ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx4 v[1:4], v10, s[4:5] offset:16 -; GFX906-NEXT: global_load_dwordx4 v[5:8], v10, s[4:5] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX906-NEXT: s_cbranch_execz .LBB5_2 +; GFX906-NEXT: global_load_dwordx4 v[1:4], v9, s[4:5] offset:16 +; GFX906-NEXT: global_load_dwordx4 v[5:8], v9, s[4:5] +; GFX906-NEXT: v_mov_b32_e32 v0, 0 +; GFX906-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX906-NEXT: s_cmov_b64 exec, vcc +; GFX906-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx4 v[1:4], v10, s[6:7] offset:16 -; GFX906-NEXT: global_load_dwordx4 v[5:8], v10, s[6:7] +; GFX906-NEXT: global_load_dwordx4 v[1:4], v9, s[6:7] offset:16 +; GFX906-NEXT: global_load_dwordx4 v[5:8], v9, s[6:7] +; GFX906-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX906-NEXT: .LBB5_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: global_store_dwordx4 v9, v[1:4], s[0:1] offset:16 +; GFX906-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] offset:16 ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: global_store_dwordx4 v9, v[5:8], s[0:1] +; GFX906-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -252,7 +264,9 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: s_add_u32 s12, s12, s9 ; GFX906-NEXT: s_addc_u32 s13, s13, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_mov_b64 s[8:9], exec ; GFX906-NEXT: v_mov_b32_e32 v4, 0 +; GFX906-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: buffer_store_dword v5, off, s[12:15], 0 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(0) @@ -275,8 +289,8 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( 
; GFX906-NEXT: global_load_dwordx4 v[53:56], v61, s[4:5] offset:32 ; GFX906-NEXT: global_load_dwordx4 v[57:60], v61, s[4:5] offset:16 ; GFX906-NEXT: global_load_dwordx4 v[0:3], v61, s[4:5] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX906-NEXT: s_cbranch_execz .LBB6_2 +; GFX906-NEXT: s_cmov_b64 exec, vcc +; GFX906-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx4 v[0:3], v61, s[6:7] offset:240 ; GFX906-NEXT: s_waitcnt vmcnt(0) @@ -300,8 +314,8 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: global_load_dwordx4 v[53:56], v61, s[6:7] offset:32 ; GFX906-NEXT: global_load_dwordx4 v[57:60], v61, s[6:7] offset:16 ; GFX906-NEXT: global_load_dwordx4 v[0:3], v61, s[6:7] +; GFX906-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX906-NEXT: .LBB6_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: s_waitcnt vmcnt(7) ; GFX906-NEXT: global_store_dwordx4 v4, v[33:36], s[0:1] offset:112 ; GFX906-NEXT: s_waitcnt vmcnt(7) @@ -407,28 +421,32 @@ define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace( ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0 -; GFX906-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_mov_b64 s[0:1], exec +; GFX906-NEXT: v_cmp_lt_u32_e64 s[2:3], 14, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[4:5] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX906-NEXT: s_cbranch_execz .LBB8_2 +; GFX906-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX906-NEXT: s_cmov_b64 exec, vcc +; GFX906-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[6:7] ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 -; GFX906-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX906-NEXT: s_andn2_b64 s[2:3], s[2:3], exec ; GFX906-NEXT: s_and_b64 s[4:5], vcc, exec -; 
GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GFX906-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: .LBB8_2: ; %Flow -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] -; GFX906-NEXT: s_cbranch_execz .LBB8_4 +; GFX906-NEXT: s_mov_b64 s[0:1], exec +; GFX906-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX906-NEXT: s_cmov_b64 exec, s[2:3] +; GFX906-NEXT: s_cbranch_scc0 .LBB8_4 ; GFX906-NEXT: ; %bb.3: ; %bb.2 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[8:9] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: .LBB8_4: ; %bb.3 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[10:11] @@ -462,13 +480,15 @@ define amdgpu_kernel void @v8i8_phi_zeroinit(ptr addrspace(1) %src1, ptr addrspa ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v5, 3, v0 -; GFX906-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 -; GFX906-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX906-NEXT: s_mov_b64 s[0:1], exec +; GFX906-NEXT: v_cmp_lt_u32_e64 s[2:3], 14, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dwordx2 v[3:4], v5, s[4:5] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX906-NEXT: s_cbranch_execz .LBB9_2 +; GFX906-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX906-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX906-NEXT: s_cmov_b64 exec, vcc +; GFX906-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx2 v[1:2], v5, s[6:7] ; GFX906-NEXT: s_mov_b32 s4, 0 @@ -477,21 +497,23 @@ define amdgpu_kernel void @v8i8_phi_zeroinit(ptr addrspace(1) %src1, ptr addrspa ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_mov_b32_e32 v3, s4 ; GFX906-NEXT: v_mov_b32_e32 v4, s5 -; GFX906-NEXT: 
s_andn2_b64 s[0:1], s[0:1], exec +; GFX906-NEXT: s_andn2_b64 s[2:3], s[2:3], exec ; GFX906-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GFX906-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: .LBB9_2: ; %Flow -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] -; GFX906-NEXT: s_cbranch_execz .LBB9_4 +; GFX906-NEXT: s_mov_b64 s[0:1], exec +; GFX906-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX906-NEXT: s_cmov_b64 exec, s[2:3] +; GFX906-NEXT: s_cbranch_scc0 .LBB9_4 ; GFX906-NEXT: ; %bb.3: ; %bb.2 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v1, v3 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: v_mov_b32_e32 v2, v4 ; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[8:9] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: .LBB9_4: ; %bb.3 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[10:11] @@ -524,17 +546,19 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace( ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; GFX906-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_mov_b64 s[0:1], exec +; GFX906-NEXT: v_cmp_lt_u32_e64 s[2:3], 14, v0 +; GFX906-NEXT: s_waitcnt lgkmcnt(0) +; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[4:5] +; GFX906-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX906-NEXT: ; implicit-def: $vgpr3 -; GFX906-NEXT: ; implicit-def: $vgpr13 -; GFX906-NEXT: ; implicit-def: $vgpr11 ; GFX906-NEXT: ; implicit-def: $vgpr14 -; GFX906-NEXT: ; implicit-def: $vgpr15 ; GFX906-NEXT: ; implicit-def: $vgpr12 +; GFX906-NEXT: ; implicit-def: $vgpr15 ; GFX906-NEXT: ; implicit-def: $vgpr16 -; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx2 v[1:2], 
v4, s[4:5] +; GFX906-NEXT: ; implicit-def: $vgpr11 +; GFX906-NEXT: ; implicit-def: $vgpr13 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshrrev_b32_e32 v5, 24, v2 ; GFX906-NEXT: v_lshrrev_b32_e32 v6, 16, v2 @@ -542,12 +566,12 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_lshrrev_b32_e32 v8, 24, v1 ; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; GFX906-NEXT: v_lshrrev_b32_e32 v10, 8, v1 -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX906-NEXT: s_cbranch_execz .LBB10_2 +; GFX906-NEXT: s_cmov_b64 exec, vcc +; GFX906-NEXT: s_cbranch_scc0 .LBB10_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx2 v[3:4], v4, s[6:7] ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 -; GFX906-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX906-NEXT: s_andn2_b64 s[2:3], s[2:3], exec ; GFX906-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX906-NEXT: v_mov_b32_e32 v1, 1 ; GFX906-NEXT: v_mov_b32_e32 v10, 2 @@ -557,18 +581,20 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_mov_b32_e32 v7, 6 ; GFX906-NEXT: v_mov_b32_e32 v6, 7 ; GFX906-NEXT: v_mov_b32_e32 v5, 8 -; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GFX906-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshrrev_b32_e32 v16, 24, v4 -; GFX906-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; GFX906-NEXT: v_lshrrev_b32_e32 v15, 8, v4 -; GFX906-NEXT: v_lshrrev_b32_e32 v14, 24, v3 -; GFX906-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GFX906-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; GFX906-NEXT: v_lshrrev_b32_e32 v13, 24, v4 +; GFX906-NEXT: v_lshrrev_b32_e32 v11, 16, v4 +; GFX906-NEXT: v_lshrrev_b32_e32 v16, 8, v4 +; GFX906-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; GFX906-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; GFX906-NEXT: v_lshrrev_b32_e32 v14, 8, v3 +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: .LBB10_2: ; %Flow -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX906-NEXT: s_and_saveexec_b64 
s[2:3], s[0:1] -; GFX906-NEXT: s_cbranch_execz .LBB10_4 +; GFX906-NEXT: s_mov_b64 s[0:1], exec +; GFX906-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX906-NEXT: s_cmov_b64 exec, s[2:3] +; GFX906-NEXT: s_cbranch_scc0 .LBB10_4 ; GFX906-NEXT: ; %bb.3: ; %bb.2 ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v10 ; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v8 @@ -583,24 +609,24 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v4, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[8:9] ; GFX906-NEXT: v_mov_b32_e32 v3, v1 -; GFX906-NEXT: v_mov_b32_e32 v13, v10 -; GFX906-NEXT: v_mov_b32_e32 v11, v9 -; GFX906-NEXT: v_mov_b32_e32 v14, v8 +; GFX906-NEXT: v_mov_b32_e32 v14, v10 +; GFX906-NEXT: v_mov_b32_e32 v12, v9 +; GFX906-NEXT: v_mov_b32_e32 v15, v8 ; GFX906-NEXT: v_mov_b32_e32 v4, v2 -; GFX906-NEXT: v_mov_b32_e32 v15, v7 -; GFX906-NEXT: v_mov_b32_e32 v12, v6 -; GFX906-NEXT: v_mov_b32_e32 v16, v5 +; GFX906-NEXT: v_mov_b32_e32 v16, v7 +; GFX906-NEXT: v_mov_b32_e32 v11, v6 +; GFX906-NEXT: v_mov_b32_e32 v13, v5 +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: .LBB10_4: ; %bb.3 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v13 -; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v14 +; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v14 +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v15 ; GFX906-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v15 -; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v16 +; GFX906-NEXT: v_lshlrev_b16_e32 v1, 
8, v16 +; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v13 ; GFX906-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v3, v11, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_mov_b32_e32 v2, 0 ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] @@ -633,27 +659,31 @@ define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspac ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v6, 3, v0 -; GFX906-NEXT: v_mov_b32_e32 v5, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_mov_b64 s[0:1], exec +; GFX906-NEXT: v_mov_b32_e32 v5, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dwordx2 v[3:4], v6, s[4:5] +; GFX906-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v1, v3 ; GFX906-NEXT: v_mov_b32_e32 v2, v4 -; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX906-NEXT: s_cbranch_execz .LBB11_4 +; GFX906-NEXT: s_cmov_b64 exec, vcc +; GFX906-NEXT: s_cbranch_scc0 .LBB11_4 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx2 v[1:2], v6, s[6:7] ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX906-NEXT: s_cbranch_execz .LBB11_3 +; GFX906-NEXT: s_mov_b64 s[2:3], exec +; GFX906-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX906-NEXT: s_cmov_b64 exec, vcc +; GFX906-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX906-NEXT: ; %bb.2: ; %bb.2 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[8:9] -; GFX906-NEXT: .LBB11_3: ; %Flow ; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX906-NEXT: .LBB11_4: ; %bb.3 +; GFX906-NEXT: .LBB11_3: ; %Flow ; 
GFX906-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX906-NEXT: .LBB11_4: ; %bb.3 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: global_store_dwordx2 v5, v[1:2], s[10:11] ; GFX906-NEXT: s_endpgm @@ -695,11 +725,11 @@ define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrsp ; GFX906-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX906-NEXT: s_and_b64 s[6:7], exec, vcc ; GFX906-NEXT: s_or_b64 s[0:1], s[6:7], s[0:1] +; GFX906-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] ; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 -; GFX906-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX906-NEXT: s_cbranch_execnz .LBB12_1 +; GFX906-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GFX906-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX906-NEXT: ; %bb.2: ; %bb.2.loopexit -; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX906-NEXT: v_mov_b32_e32 v1, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) @@ -731,40 +761,44 @@ define amdgpu_kernel void @v8i8_multiuse_multiblock(ptr addrspace(1) %src1, ptr ; GFX906-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 -; GFX906-NEXT: v_cmp_lt_u32_e64 s[2:3], 14, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_mov_b64 s[12:13], exec ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dwordx2 v[1:2], v1, s[4:5] +; GFX906-NEXT: v_cmp_lt_u32_e64 s[2:3], 14, v0 +; GFX906-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX906-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX906-NEXT: s_cbranch_execz .LBB13_2 +; GFX906-NEXT: s_cmov_b64 exec, vcc +; GFX906-NEXT: s_cbranch_scc0 .LBB13_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: s_movk_i32 s6, 0xff00 +; GFX906-NEXT: s_movk_i32 s4, 0xff00 ; GFX906-NEXT: v_mov_b32_e32 v5, 8 -; GFX906-NEXT: v_and_b32_sdwa v6, v1, s6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX906-NEXT: s_mov_b32 s6, 
0x6070504 +; GFX906-NEXT: v_and_b32_sdwa v6, v1, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX906-NEXT: s_mov_b32 s4, 0x6070504 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 ; GFX906-NEXT: v_and_b32_e32 v4, 0xffffff00, v1 ; GFX906-NEXT: v_lshlrev_b16_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX906-NEXT: v_perm_b32 v7, v1, v1, s6 +; GFX906-NEXT: v_perm_b32 v7, v1, v1, s4 ; GFX906-NEXT: s_andn2_b64 s[2:3], s[2:3], exec -; GFX906-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX906-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX906-NEXT: v_mov_b32_e32 v3, 0 ; GFX906-NEXT: v_or_b32_sdwa v4, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v6, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX906-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] +; GFX906-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX906-NEXT: v_or_b32_sdwa v6, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dword v3, v1, s[8:9] ; GFX906-NEXT: global_store_dword v3, v7, s[8:9] offset:8 ; GFX906-NEXT: global_store_dword v3, v6, s[8:9] offset:16 ; GFX906-NEXT: global_store_dword v3, v4, s[8:9] offset:24 +; GFX906-NEXT: s_or_b64 exec, exec, s[12:13] ; GFX906-NEXT: .LBB13_2: ; %Flow -; GFX906-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX906-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] -; GFX906-NEXT: s_cbranch_execz .LBB13_4 +; GFX906-NEXT: s_mov_b64 s[4:5], exec +; GFX906-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX906-NEXT: s_cmov_b64 exec, s[2:3] +; GFX906-NEXT: s_cbranch_scc0 .LBB13_4 ; GFX906-NEXT: ; %bb.3: ; %bb.2 ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v2 ; GFX906-NEXT: v_and_b32_e32 v4, 0xffffff00, v2 @@ -786,8 +820,8 @@ 
define amdgpu_kernel void @v8i8_multiuse_multiblock(ptr addrspace(1) %src1, ptr ; GFX906-NEXT: global_store_dword v0, v4, s[10:11] offset:8 ; GFX906-NEXT: global_store_dword v0, v7, s[10:11] offset:16 ; GFX906-NEXT: global_store_dword v0, v2, s[10:11] offset:24 -; GFX906-NEXT: .LBB13_4: ; %bb.3 ; GFX906-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX906-NEXT: .LBB13_4: ; %bb.3 ; GFX906-NEXT: s_movk_i32 s3, 0xff00 ; GFX906-NEXT: v_mov_b32_e32 v4, 8 ; GFX906-NEXT: s_movk_i32 s2, 0xff diff --git a/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll b/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll index d7db68a433319c..474e77a7102cad 100644 --- a/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll +++ b/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll @@ -55,12 +55,12 @@ define amdgpu_kernel void @foo(i1 %cmp1) { ; GFX906-NEXT: v_cmp_eq_u64_e64 s[0:1], s[0:1], v[5:6] ; GFX906-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX906-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX906-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX906-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6 -; GFX906-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX906-NEXT: s_cbranch_execnz .LBB0_1 +; GFX906-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; GFX906-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX906-NEXT: ; %bb.2: ; GFX906-NEXT: s_cmp_lg_u32 s5, 0 -; GFX906-NEXT: s_mov_b64 exec, s[2:3] ; GFX906-NEXT: s_cselect_b32 s5, 0x3ff00000, 0 ; GFX906-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] ; GFX906-NEXT: s_mov_b32 s5, s4 diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index c3a81771a2790c..f483bafce9138d 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -318,8 +318,9 @@ define amdgpu_kernel void @test_mask_if(ptr addrspace(1) %arg) #0 { ; GFX1032-LABEL: test_mask_if: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_cmp_lt_u32_e32 vcc_lo, 10, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB9_2 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: 
s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX1032-NEXT: ; %bb.1: ; %if ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 @@ -331,8 +332,9 @@ define amdgpu_kernel void @test_mask_if(ptr addrspace(1) %arg) #0 { ; GFX1064-LABEL: test_mask_if: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: v_cmp_lt_u32_e32 vcc, 10, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB9_2 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX1064-NEXT: ; %bb.1: ; %if ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -362,20 +364,21 @@ define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 { ; GFX1032-NEXT: s_branch .LBB10_2 ; GFX1032-NEXT: .LBB10_1: ; %bb13 ; GFX1032-NEXT: ; in Loop: Header=BB10_2 Depth=1 -; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfe, v4 ; GFX1032-NEXT: v_add_nc_u32_e32 v1, 1, v4 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execz .LBB10_8 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc0 .LBB10_8 ; GFX1032-NEXT: .LBB10_2: ; %bb2 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_cmp_ge_i32_e64 s4, v1, v0 ; GFX1032-NEXT: v_cmp_lt_i32_e32 vcc_lo, v1, v0 +; GFX1032-NEXT: v_cmp_ge_i32_e64 s4, v1, v0 +; GFX1032-NEXT: s_mov_b32 s5, exec_lo ; GFX1032-NEXT: s_mov_b32 s3, 0 -; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB10_4 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB10_4 ; GFX1032-NEXT: ; %bb.3: ; %bb5 ; GFX1032-NEXT: ; in Loop: Header=BB10_2 Depth=1 ; GFX1032-NEXT: v_ashrrev_i32_e32 v2, 31, v1 @@ -390,27 
+393,33 @@ define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 { ; GFX1032-NEXT: v_cmp_gt_i32_e32 vcc_lo, 11, v4 ; GFX1032-NEXT: s_and_b32 s6, vcc_lo, exec_lo ; GFX1032-NEXT: s_or_b32 s4, s4, s6 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1032-NEXT: .LBB10_4: ; %Flow ; GFX1032-NEXT: ; in Loop: Header=BB10_2 Depth=1 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1032-NEXT: s_mov_b32 s5, exec_lo +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr4 -; GFX1032-NEXT: s_and_saveexec_b32 s5, s4 -; GFX1032-NEXT: s_xor_b32 s4, exec_lo, s5 +; GFX1032-NEXT: s_cmov_b32 exec_lo, s4 +; GFX1032-NEXT: s_cbranch_scc0 .LBB10_6 ; GFX1032-NEXT: ; %bb.5: ; %bb11 ; GFX1032-NEXT: ; in Loop: Header=BB10_2 Depth=1 ; GFX1032-NEXT: v_lshrrev_b32_e32 v4, 31, v1 ; GFX1032-NEXT: s_andn2_b32 s3, s3, exec_lo ; GFX1032-NEXT: v_add_nc_u32_e32 v4, v1, v4 ; GFX1032-NEXT: v_ashrrev_i32_e32 v4, 1, v4 -; GFX1032-NEXT: ; %bb.6: ; %Flow1 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1032-NEXT: .LBB10_6: ; %Flow1 ; GFX1032-NEXT: ; in Loop: Header=BB10_2 Depth=1 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: s_and_saveexec_b32 s4, s3 -; GFX1032-NEXT: s_cbranch_execz .LBB10_1 +; GFX1032-NEXT: s_mov_b32 s4, exec_lo +; GFX1032-NEXT: s_and_b32 s3, s3, exec_lo +; GFX1032-NEXT: s_cmov_b32 exec_lo, s3 +; GFX1032-NEXT: s_cbranch_scc0 .LBB10_1 ; GFX1032-NEXT: ; %bb.7: ; %bb10 ; GFX1032-NEXT: ; in Loop: Header=BB10_2 Depth=1 ; GFX1032-NEXT: v_mov_b32_e32 v4, v1 ; GFX1032-NEXT: global_store_dword v[2:3], v0, off +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_branch .LBB10_1 ; GFX1032-NEXT: .LBB10_8: ; %bb1 ; GFX1032-NEXT: s_endpgm @@ -424,20 +433,21 @@ define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 { ; GFX1064-NEXT: s_branch .LBB10_2 ; GFX1064-NEXT: .LBB10_1: ; %bb13 ; GFX1064-NEXT: ; in Loop: Header=BB10_2 Depth=1 -; GFX1064-NEXT: s_waitcnt_depctr 
0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX1064-NEXT: v_cmp_lt_i32_e32 vcc, 0xfe, v4 ; GFX1064-NEXT: v_add_nc_u32_e32 v1, 1, v4 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execz .LBB10_8 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc0 .LBB10_8 ; GFX1064-NEXT: .LBB10_2: ; %bb2 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_cmp_ge_i32_e64 s[6:7], v1, v0 ; GFX1064-NEXT: v_cmp_lt_i32_e32 vcc, v1, v0 +; GFX1064-NEXT: v_cmp_ge_i32_e64 s[6:7], v1, v0 +; GFX1064-NEXT: s_mov_b64 s[8:9], exec ; GFX1064-NEXT: s_mov_b64 s[4:5], 0 -; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB10_4 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB10_4 ; GFX1064-NEXT: ; %bb.3: ; %bb5 ; GFX1064-NEXT: ; in Loop: Header=BB10_2 Depth=1 ; GFX1064-NEXT: v_ashrrev_i32_e32 v2, 31, v1 @@ -452,27 +462,33 @@ define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 { ; GFX1064-NEXT: v_cmp_gt_i32_e32 vcc, 11, v4 ; GFX1064-NEXT: s_and_b64 s[10:11], vcc, exec ; GFX1064-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] +; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1064-NEXT: .LBB10_4: ; %Flow ; GFX1064-NEXT: ; in Loop: Header=BB10_2 Depth=1 -; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1064-NEXT: s_mov_b64 s[8:9], exec +; GFX1064-NEXT: s_and_b64 s[6:7], s[6:7], exec ; GFX1064-NEXT: ; implicit-def: $vgpr4 -; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], s[6:7] -; GFX1064-NEXT: s_xor_b64 s[6:7], exec, s[8:9] +; GFX1064-NEXT: s_cmov_b64 exec, s[6:7] +; GFX1064-NEXT: s_cbranch_scc0 .LBB10_6 ; GFX1064-NEXT: ; %bb.5: ; %bb11 ; GFX1064-NEXT: ; in Loop: Header=BB10_2 Depth=1 ; GFX1064-NEXT: v_lshrrev_b32_e32 v4, 31, v1 ; GFX1064-NEXT: s_andn2_b64 s[4:5], s[4:5], exec ; GFX1064-NEXT: v_add_nc_u32_e32 v4, v1, v4 ; 
GFX1064-NEXT: v_ashrrev_i32_e32 v4, 1, v4 -; GFX1064-NEXT: ; %bb.6: ; %Flow1 +; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1064-NEXT: .LBB10_6: ; %Flow1 ; GFX1064-NEXT: ; in Loop: Header=BB10_2 Depth=1 -; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GFX1064-NEXT: s_cbranch_execz .LBB10_1 +; GFX1064-NEXT: s_mov_b64 s[6:7], exec +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cmov_b64 exec, s[4:5] +; GFX1064-NEXT: s_cbranch_scc0 .LBB10_1 ; GFX1064-NEXT: ; %bb.7: ; %bb10 ; GFX1064-NEXT: ; in Loop: Header=BB10_2 Depth=1 ; GFX1064-NEXT: v_mov_b32_e32 v4, v1 ; GFX1064-NEXT: global_store_dword v[2:3], v0, off +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX1064-NEXT: s_branch .LBB10_1 ; GFX1064-NEXT: .LBB10_8: ; %bb1 ; GFX1064-NEXT: s_endpgm @@ -517,8 +533,9 @@ define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) # ; GFX1032: ; %bb.0: ; %bb ; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB11_6 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB11_6 ; GFX1032-NEXT: ; %bb.1: ; %.preheader ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: v_min_u32_e32 v1, 0x100, v0 @@ -540,8 +557,9 @@ define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) # ; GFX1032-NEXT: ; in Loop: Header=BB11_4 Depth=1 ; GFX1032-NEXT: s_and_b32 s5, exec_lo, s3 ; GFX1032-NEXT: s_or_b32 s4, s5, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: s_cbranch_execz .LBB11_6 +; GFX1032-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX1032-NEXT: s_cbranch_scc0 .LBB11_6 ; GFX1032-NEXT: .LBB11_4: ; %bb2 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ 
-561,8 +579,9 @@ define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) # ; GFX1064: ; %bb.0: ; %bb ; GFX1064-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_mov_b32 s6, 0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB11_6 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB11_6 ; GFX1064-NEXT: ; %bb.1: ; %.preheader ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: v_min_u32_e32 v1, 0x100, v0 @@ -584,8 +603,9 @@ define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) # ; GFX1064-NEXT: ; in Loop: Header=BB11_4 Depth=1 ; GFX1064-NEXT: s_and_b64 s[8:9], exec, s[4:5] ; GFX1064-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execz .LBB11_6 +; GFX1064-NEXT: s_andn2_b64 s[8:9], exec, s[2:3] +; GFX1064-NEXT: s_cselect_b64 exec, s[8:9], s[2:3] +; GFX1064-NEXT: s_cbranch_scc0 .LBB11_6 ; GFX1064-NEXT: .LBB11_4: ; %bb2 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -1268,20 +1288,22 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, p ; GFX1032-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX1032-NEXT: s_mov_b32 s1, exec_lo ; GFX1032-NEXT: s_mov_b32 vcc_lo, 0 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dwordx3 v[1:3], v1, s[6:7] -; GFX1032-NEXT: s_and_saveexec_b32 s1, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB22_2 +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032-NEXT: s_cmov_b32 exec_lo, s0 +; GFX1032-NEXT: s_cbranch_scc0 .LBB22_2 ; GFX1032-NEXT: ; %bb.1: ; %bb ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: global_load_dword v0, v0, s[8:9] glc dlc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, 
v0 ; GFX1032-NEXT: s_and_b32 vcc_lo, vcc_lo, exec_lo -; GFX1032-NEXT: .LBB22_2: ; %exit -; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: .LBB22_2: ; %exit ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_div_fmas_f32 v1, v1, v2, v3 @@ -1292,23 +1314,25 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, p ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1064-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; GFX1064-NEXT: s_load_dwordx2 s[10:11], s[2:3], 0x34 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0 +; GFX1064-NEXT: s_mov_b64 s[8:9], exec ; GFX1064-NEXT: s_mov_b64 vcc, 0 +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dwordx3 v[1:3], v1, s[6:7] -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB22_2 +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064-NEXT: s_cmov_b64 exec, s[0:1] +; GFX1064-NEXT: s_cbranch_scc0 .LBB22_2 ; GFX1064-NEXT: ; %bb.1: ; %bb ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: global_load_dword v0, v0, s[8:9] glc dlc +; GFX1064-NEXT: global_load_dword v0, v0, s[10:11] glc dlc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_b64 vcc, vcc, exec +; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1064-NEXT: .LBB22_2: ; %exit -; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_div_fmas_f32 v1, v1, v2, v3 @@ -1544,8 +1568,9 @@ define amdgpu_kernel void @test_invert_true_phi_cond_break_loop(i32 %arg) #0 { ; GFX1032-NEXT: s_add_i32 s2, s2, 1 ; GFX1032-NEXT: s_and_b32 s3, exec_lo, s3 ; GFX1032-NEXT: s_or_b32 s0, s3, s0 -; GFX1032-NEXT: s_andn2_b32 
exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB27_4 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s0 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s0 +; GFX1032-NEXT: s_cbranch_scc0 .LBB27_4 ; GFX1032-NEXT: .LBB27_2: ; %bb1 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_or_b32 s1, s1, exec_lo @@ -1561,7 +1586,6 @@ define amdgpu_kernel void @test_invert_true_phi_cond_break_loop(i32 %arg) #0 { ; GFX1032-NEXT: s_or_b32 s1, s1, s3 ; GFX1032-NEXT: s_branch .LBB27_1 ; GFX1032-NEXT: .LBB27_4: ; %bb9 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: v_mov_b32_e32 v0, 7 ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_write_b32 v0, v0 @@ -1582,8 +1606,9 @@ define amdgpu_kernel void @test_invert_true_phi_cond_break_loop(i32 %arg) #0 { ; GFX1064-NEXT: s_add_i32 s4, s4, 1 ; GFX1064-NEXT: s_and_b64 s[6:7], exec, s[6:7] ; GFX1064-NEXT: s_or_b64 s[0:1], s[6:7], s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB27_4 +; GFX1064-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GFX1064-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GFX1064-NEXT: s_cbranch_scc0 .LBB27_4 ; GFX1064-NEXT: .LBB27_2: ; %bb1 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_or_b64 s[2:3], s[2:3], exec @@ -1599,7 +1624,6 @@ define amdgpu_kernel void @test_invert_true_phi_cond_break_loop(i32 %arg) #0 { ; GFX1064-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] ; GFX1064-NEXT: s_branch .LBB27_1 ; GFX1064-NEXT: .LBB27_4: ; %bb9 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064-NEXT: v_mov_b32_e32 v0, 7 ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_write_b32 v0, v0 @@ -1911,11 +1935,13 @@ define amdgpu_ps float @test_wwm2(i32 inreg %idx) { ; GFX1032-LABEL: test_wwm2: ; GFX1032: ; %bb.0: ; %main_body ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GFX1032-NEXT: s_mov_b32 s1, exec_lo ; GFX1032-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX1032-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0 ; GFX1032-NEXT: 
v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB35_2 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB35_2 ; GFX1032-NEXT: ; %bb.1: ; %if ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_mov_b32_e32 v1, s0 @@ -1925,18 +1951,20 @@ define amdgpu_ps float @test_wwm2(i32 inreg %idx) { ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: v_mov_b32_e32 v0, v2 ; GFX1032-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX1032-NEXT: .LBB35_2: ; %endif ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: .LBB35_2: ; %endif ; GFX1032-NEXT: ; return to shader part epilog ; ; GFX1064-LABEL: test_wwm2: ; GFX1064: ; %bb.0: ; %main_body ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX1064-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB35_2 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB35_2 ; GFX1064-NEXT: ; %bb.1: ; %if ; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064-NEXT: v_mov_b32_e32 v1, s0 @@ -1946,8 +1974,8 @@ define amdgpu_ps float @test_wwm2(i32 inreg %idx) { ; GFX1064-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v0, v2 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX1064-NEXT: .LBB35_2: ; %endif ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: .LBB35_2: ; %endif ; GFX1064-NEXT: ; return to shader part epilog main_body: ; use mbcnt to make sure the branch is divergent @@ -1998,11 +2026,13 @@ define amdgpu_ps float @test_strict_wwm2(i32 inreg %idx) { ; GFX1032-LABEL: test_strict_wwm2: ; GFX1032: ; %bb.0: ; %main_body ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GFX1032-NEXT: s_mov_b32 s1, exec_lo ; GFX1032-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; 
GFX1032-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB37_2 +; GFX1032-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB37_2 ; GFX1032-NEXT: ; %bb.1: ; %if ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_mov_b32_e32 v1, s0 @@ -2012,18 +2042,20 @@ define amdgpu_ps float @test_strict_wwm2(i32 inreg %idx) { ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: v_mov_b32_e32 v0, v2 ; GFX1032-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX1032-NEXT: .LBB37_2: ; %endif ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: .LBB37_2: ; %endif ; GFX1032-NEXT: ; return to shader part epilog ; ; GFX1064-LABEL: test_strict_wwm2: ; GFX1064: ; %bb.0: ; %main_body ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX1064-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB37_2 +; GFX1064-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB37_2 ; GFX1064-NEXT: ; %bb.1: ; %if ; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064-NEXT: v_mov_b32_e32 v1, s0 @@ -2033,8 +2065,8 @@ define amdgpu_ps float @test_strict_wwm2(i32 inreg %idx) { ; GFX1064-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v0, v2 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX1064-NEXT: .LBB37_2: ; %endif ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: .LBB37_2: ; %endif ; GFX1064-NEXT: ; return to shader part epilog main_body: ; use mbcnt to make sure the branch is divergent @@ -2497,10 +2529,12 @@ define amdgpu_kernel void @icmp64(i32 %n, i32 %s) { ; GFX1032-NEXT: s_cmp_gt_u32 s0, 9 ; GFX1032-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1032-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: 
s_and_saveexec_b32 s1, s0 +; GFX1032-NEXT: s_and_b32 s0, s0, exec_lo +; GFX1032-NEXT: s_cmov_b32 exec_lo, s0 +; GFX1032-NEXT: s_cbranch_scc0 .LBB50_2 ; GFX1032-NEXT: ; %bb.1: ; %if.then ; GFX1032-NEXT: ; divergent unreachable -; GFX1032-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; GFX1032-NEXT: .LBB50_2: ; %UnifiedReturnBlock ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: icmp64: @@ -2531,10 +2565,12 @@ define amdgpu_kernel void @icmp64(i32 %n, i32 %s) { ; GFX1064-NEXT: s_cmp_gt_u32 s0, 9 ; GFX1064-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX1064-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] +; GFX1064-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX1064-NEXT: s_cmov_b64 exec, s[0:1] +; GFX1064-NEXT: s_cbranch_scc0 .LBB50_2 ; GFX1064-NEXT: ; %bb.1: ; %if.then ; GFX1064-NEXT: ; divergent unreachable -; GFX1064-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; GFX1064-NEXT: .LBB50_2: ; %UnifiedReturnBlock ; GFX1064-NEXT: s_endpgm entry: %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -2590,10 +2626,12 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) { ; GFX1032-NEXT: s_cmp_gt_u32 s0, 9 ; GFX1032-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1032-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, s0 +; GFX1032-NEXT: s_and_b32 s0, s0, exec_lo +; GFX1032-NEXT: s_cmov_b32 exec_lo, s0 +; GFX1032-NEXT: s_cbranch_scc0 .LBB51_2 ; GFX1032-NEXT: ; %bb.1: ; %if.then ; GFX1032-NEXT: ; divergent unreachable -; GFX1032-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; GFX1032-NEXT: .LBB51_2: ; %UnifiedReturnBlock ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: fcmp64: @@ -2622,10 +2660,12 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) { ; GFX1064-NEXT: s_cmp_gt_u32 s0, 9 ; GFX1064-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX1064-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] +; GFX1064-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX1064-NEXT: s_cmov_b64 exec, s[0:1] +; GFX1064-NEXT: s_cbranch_scc0 .LBB51_2 ; 
GFX1064-NEXT: ; %bb.1: ; %if.then ; GFX1064-NEXT: ; divergent unreachable -; GFX1064-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; GFX1064-NEXT: .LBB51_2: ; %UnifiedReturnBlock ; GFX1064-NEXT: s_endpgm entry: %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -2684,10 +2724,12 @@ define amdgpu_kernel void @icmp32(i32 %n, i32 %s) { ; GFX1032-NEXT: s_cmp_gt_u32 s0, 9 ; GFX1032-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1032-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, s0 +; GFX1032-NEXT: s_and_b32 s0, s0, exec_lo +; GFX1032-NEXT: s_cmov_b32 exec_lo, s0 +; GFX1032-NEXT: s_cbranch_scc0 .LBB52_2 ; GFX1032-NEXT: ; %bb.1: ; %if.then ; GFX1032-NEXT: ; divergent unreachable -; GFX1032-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; GFX1032-NEXT: .LBB52_2: ; %UnifiedReturnBlock ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: icmp32: @@ -2718,10 +2760,12 @@ define amdgpu_kernel void @icmp32(i32 %n, i32 %s) { ; GFX1064-NEXT: s_cmp_gt_u32 s0, 9 ; GFX1064-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX1064-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] +; GFX1064-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX1064-NEXT: s_cmov_b64 exec, s[0:1] +; GFX1064-NEXT: s_cbranch_scc0 .LBB52_2 ; GFX1064-NEXT: ; %bb.1: ; %if.then ; GFX1064-NEXT: ; divergent unreachable -; GFX1064-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; GFX1064-NEXT: .LBB52_2: ; %UnifiedReturnBlock ; GFX1064-NEXT: s_endpgm entry: %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -2776,10 +2820,12 @@ define amdgpu_kernel void @fcmp32(float %n, float %s) { ; GFX1032-NEXT: s_cmp_gt_u32 s0, 9 ; GFX1032-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1032-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, s0 +; GFX1032-NEXT: s_and_b32 s0, s0, exec_lo +; GFX1032-NEXT: s_cmov_b32 exec_lo, s0 +; GFX1032-NEXT: s_cbranch_scc0 .LBB53_2 ; GFX1032-NEXT: ; %bb.1: ; %if.then ; GFX1032-NEXT: ; divergent unreachable -; GFX1032-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; GFX1032-NEXT: .LBB53_2: ; 
%UnifiedReturnBlock ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: fcmp32: @@ -2808,10 +2854,12 @@ define amdgpu_kernel void @fcmp32(float %n, float %s) { ; GFX1064-NEXT: s_cmp_gt_u32 s0, 9 ; GFX1064-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX1064-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] +; GFX1064-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX1064-NEXT: s_cmov_b64 exec, s[0:1] +; GFX1064-NEXT: s_cbranch_scc0 .LBB53_2 ; GFX1064-NEXT: ; %bb.1: ; %if.then ; GFX1064-NEXT: ; divergent unreachable -; GFX1064-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; GFX1064-NEXT: .LBB53_2: ; %UnifiedReturnBlock ; GFX1064-NEXT: s_endpgm entry: %id = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/while-break.ll b/llvm/test/CodeGen/AMDGPU/while-break.ll index 46254994580d2d..38abc2014c1905 100644 --- a/llvm/test/CodeGen/AMDGPU/while-break.ll +++ b/llvm/test/CodeGen/AMDGPU/while-break.ll @@ -4,47 +4,55 @@ define amdgpu_ps float @while_break(i32 %z, float %v, i32 %x, i32 %y) #0 { ; GCN-LABEL: while_break: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_mov_b32 s1, -1 -; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: s_mov_b32 s0, -1 +; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_branch .LBB0_2 ; GCN-NEXT: .LBB0_1: ; %Flow2 ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GCN-NEXT: s_and_b32 s2, exec_lo, s3 -; GCN-NEXT: s_or_b32 s0, s2, s0 -; GCN-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GCN-NEXT: s_cbranch_execz .LBB0_8 +; GCN-NEXT: s_and_b32 s2, exec_lo, s2 +; GCN-NEXT: s_or_b32 s1, s2, s1 +; GCN-NEXT: s_andn2_b32 s2, exec_lo, s1 +; GCN-NEXT: s_cselect_b32 exec_lo, s2, s1 +; GCN-NEXT: s_cbranch_scc0 .LBB0_8 ; GCN-NEXT: .LBB0_2: ; %header ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: s_add_i32 s1, s1, 1 +; GCN-NEXT: s_add_i32 s0, s0, 1 ; GCN-NEXT: s_mov_b32 s2, 0 -; GCN-NEXT: v_cmp_ge_i32_e32 vcc_lo, s1, v2 -; GCN-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GCN-NEXT: s_xor_b32 s3, exec_lo, s3 
+; GCN-NEXT: v_cmp_ge_i32_e32 vcc_lo, s0, v2 +; GCN-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GCN-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GCN-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GCN-NEXT: s_cbranch_scc0 .LBB0_4 ; GCN-NEXT: ; %bb.3: ; %else ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, s1, v3 +; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, s0, v3 ; GCN-NEXT: s_and_b32 s2, vcc_lo, exec_lo -; GCN-NEXT: ; %bb.4: ; %Flow +; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GCN-NEXT: .LBB0_4: ; %Flow ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GCN-NEXT: s_andn2_saveexec_b32 s3, s3 +; GCN-NEXT: s_xor_b32 s4, s3, exec_lo +; GCN-NEXT: s_cmp_lg_u32 s3, 0 +; GCN-NEXT: s_cmov_b32 exec_lo, s3 +; GCN-NEXT: s_cbranch_scc0 .LBB0_6 ; GCN-NEXT: ; %bb.5: ; %if ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GCN-NEXT: s_or_b32 s2, s2, exec_lo -; GCN-NEXT: ; %bb.6: ; %Flow1 +; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GCN-NEXT: .LBB0_6: ; %Flow1 ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GCN-NEXT: s_mov_b32 s3, -1 -; GCN-NEXT: s_and_saveexec_b32 s4, s2 -; GCN-NEXT: s_cbranch_execz .LBB0_1 +; GCN-NEXT: s_mov_b32 s3, exec_lo +; GCN-NEXT: s_and_b32 s4, s2, exec_lo +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_cmov_b32 exec_lo, s4 +; GCN-NEXT: s_cbranch_scc0 .LBB0_1 ; GCN-NEXT: ; %bb.7: ; %latch ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, s1, v0 -; GCN-NEXT: s_orn2_b32 s3, vcc_lo, exec_lo +; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, s0, v0 +; GCN-NEXT: s_orn2_b32 s2, vcc_lo, exec_lo +; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GCN-NEXT: s_branch .LBB0_1 ; GCN-NEXT: .LBB0_8: ; %end -; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: ; return to shader part epilog entry: @@ -79,49 +87,57 @@ end: define amdgpu_ps float @while_break2(i32 %z, float %v, i32 %x, i32 %y) #0 { ; GCN-LABEL: while_break2: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: 
s_mov_b32 s1, -1 -; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: s_mov_b32 s0, -1 +; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_branch .LBB1_2 ; GCN-NEXT: .LBB1_1: ; %Flow2 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GCN-NEXT: s_and_b32 s2, exec_lo, s3 -; GCN-NEXT: s_or_b32 s0, s2, s0 -; GCN-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GCN-NEXT: s_cbranch_execz .LBB1_8 +; GCN-NEXT: s_and_b32 s2, exec_lo, s2 +; GCN-NEXT: s_or_b32 s1, s2, s1 +; GCN-NEXT: s_andn2_b32 s2, exec_lo, s1 +; GCN-NEXT: s_cselect_b32 exec_lo, s2, s1 +; GCN-NEXT: s_cbranch_scc0 .LBB1_8 ; GCN-NEXT: .LBB1_2: ; %header ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: s_add_i32 s1, s1, 1 +; GCN-NEXT: s_add_i32 s0, s0, 1 ; GCN-NEXT: s_mov_b32 s2, 0 -; GCN-NEXT: v_cmp_ge_i32_e32 vcc_lo, s1, v2 -; GCN-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GCN-NEXT: s_xor_b32 s3, exec_lo, s3 +; GCN-NEXT: v_cmp_ge_i32_e32 vcc_lo, s0, v2 +; GCN-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GCN-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GCN-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GCN-NEXT: s_cbranch_scc0 .LBB1_4 ; GCN-NEXT: ; %bb.3: ; %if ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GCN-NEXT: s_mov_b32 s2, exec_lo -; GCN-NEXT: ; %bb.4: ; %Flow +; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GCN-NEXT: .LBB1_4: ; %Flow ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: s_andn2_saveexec_b32 s3, s3 +; GCN-NEXT: s_xor_b32 s4, s3, exec_lo +; GCN-NEXT: s_cmp_lg_u32 s3, 0 +; GCN-NEXT: s_cmov_b32 exec_lo, s3 +; GCN-NEXT: s_cbranch_scc0 .LBB1_6 ; GCN-NEXT: ; %bb.5: ; %else ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, s1, v3 +; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, s0, v3 ; GCN-NEXT: s_andn2_b32 s2, s2, exec_lo -; GCN-NEXT: s_and_b32 s4, vcc_lo, exec_lo -; GCN-NEXT: s_or_b32 s2, s2, s4 -; GCN-NEXT: ; %bb.6: ; %Flow1 +; GCN-NEXT: s_and_b32 s3, vcc_lo, exec_lo +; GCN-NEXT: s_or_b32 s2, s2, s3 +; GCN-NEXT: s_or_b32 exec_lo, exec_lo, 
s4 +; GCN-NEXT: .LBB1_6: ; %Flow1 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GCN-NEXT: s_mov_b32 s3, -1 -; GCN-NEXT: s_and_saveexec_b32 s4, s2 -; GCN-NEXT: s_cbranch_execz .LBB1_1 +; GCN-NEXT: s_mov_b32 s3, exec_lo +; GCN-NEXT: s_and_b32 s4, s2, exec_lo +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_cmov_b32 exec_lo, s4 +; GCN-NEXT: s_cbranch_scc0 .LBB1_1 ; GCN-NEXT: ; %bb.7: ; %latch ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, s1, v0 -; GCN-NEXT: s_orn2_b32 s3, vcc_lo, exec_lo +; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, s0, v0 +; GCN-NEXT: s_orn2_b32 s2, vcc_lo, exec_lo +; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GCN-NEXT: s_branch .LBB1_1 ; GCN-NEXT: .LBB1_8: ; %end -; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: ; return to shader part epilog entry: @@ -162,47 +178,51 @@ define amdgpu_ps < 2 x float> @while_break_two_chains_of_phi(float %v, i32 %x, i ; GCN-NEXT: s_branch .LBB2_2 ; GCN-NEXT: .LBB2_1: ; %Flow1 ; GCN-NEXT: ; in Loop: Header=BB2_2 Depth=1 -; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GCN-NEXT: s_and_b32 s1, exec_lo, s4 +; GCN-NEXT: s_and_b32 s1, exec_lo, s3 ; GCN-NEXT: s_or_b32 s2, s1, s2 -; GCN-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GCN-NEXT: s_cbranch_execz .LBB2_6 +; GCN-NEXT: s_andn2_b32 s1, exec_lo, s2 +; GCN-NEXT: s_cselect_b32 exec_lo, s1, s2 +; GCN-NEXT: s_cbranch_scc0 .LBB2_6 ; GCN-NEXT: .LBB2_2: ; %header ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_cmp_ge_i32_e64 s3, s0, v1 ; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, s0, v1 -; GCN-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GCN-NEXT: s_cbranch_execz .LBB2_4 +; GCN-NEXT: v_cmp_ge_i32_e64 s4, s0, v1 +; GCN-NEXT: s_mov_b32 s3, exec_lo +; GCN-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GCN-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GCN-NEXT: s_cbranch_scc0 .LBB2_4 ; GCN-NEXT: ; %bb.3: ; %if ; GCN-NEXT: ; in Loop: Header=BB2_2 Depth=1 ; GCN-NEXT: s_ashr_i32 s1, s0, 31 ; GCN-NEXT: 
s_lshl_b64 s[6:7], s[0:1], 2 -; GCN-NEXT: s_andn2_b32 s1, s3, exec_lo +; GCN-NEXT: s_andn2_b32 s1, s4, exec_lo ; GCN-NEXT: v_add_co_u32 v6, vcc_lo, v4, s6 ; GCN-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s7, v5, vcc_lo ; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, s0, v2 ; GCN-NEXT: global_load_dword v0, v[6:7], off -; GCN-NEXT: s_and_b32 s3, vcc_lo, exec_lo -; GCN-NEXT: s_or_b32 s3, s1, s3 +; GCN-NEXT: s_and_b32 s4, vcc_lo, exec_lo +; GCN-NEXT: s_or_b32 s4, s1, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_f32_e32 v6, 1.0, v0 ; GCN-NEXT: v_mov_b32_e32 v0, v6 +; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GCN-NEXT: .LBB2_4: ; %Flow ; GCN-NEXT: ; in Loop: Header=BB2_2 Depth=1 -; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GCN-NEXT: v_mov_b32_e32 v7, v6 -; GCN-NEXT: s_mov_b32 s4, -1 -; GCN-NEXT: s_and_saveexec_b32 s1, s3 -; GCN-NEXT: s_cbranch_execz .LBB2_1 +; GCN-NEXT: s_mov_b32 s1, exec_lo +; GCN-NEXT: s_and_b32 s4, s4, exec_lo +; GCN-NEXT: s_mov_b32 s3, -1 +; GCN-NEXT: s_cmov_b32 exec_lo, s4 +; GCN-NEXT: s_cbranch_scc0 .LBB2_1 ; GCN-NEXT: ; %bb.5: ; %latch ; GCN-NEXT: ; in Loop: Header=BB2_2 Depth=1 ; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, s0, v3 ; GCN-NEXT: v_mov_b32_e32 v7, v0 ; GCN-NEXT: s_add_i32 s0, s0, 1 -; GCN-NEXT: s_orn2_b32 s4, vcc_lo, exec_lo +; GCN-NEXT: s_orn2_b32 s3, vcc_lo, exec_lo +; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GCN-NEXT: s_branch .LBB2_1 ; GCN-NEXT: .LBB2_6: ; %end -; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GCN-NEXT: v_mov_b32_e32 v0, v7 ; GCN-NEXT: v_mov_b32_e32 v1, v6 ; GCN-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll index 6b4c2da772cdc2..4a0596822e18f9 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -505,9 +505,11 @@ define amdgpu_ps float @test_wwm3(i32 inreg %idx) { ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 +; GFX9-W64-NEXT: s_mov_b64 
s[2:3], exec ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-W64-NEXT: s_cbranch_execz .LBB13_2 +; GFX9-W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-W64-NEXT: s_cmov_b64 exec, vcc +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB13_2 ; GFX9-W64-NEXT: ; %bb.1: ; %if ; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 @@ -517,18 +519,20 @@ define amdgpu_ps float @test_wwm3(i32 inreg %idx) { ; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-W64-NEXT: .LBB13_2: ; %endif ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-W64-NEXT: .LBB13_2: ; %endif ; GFX9-W64-NEXT: ; return to shader part epilog ; ; GFX10-W32-LABEL: test_wwm3: ; GFX10-W32: ; %bb.0: ; %main_body ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10-W32-NEXT: s_cbranch_execz .LBB13_2 +; GFX10-W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB13_2 ; GFX10-W32-NEXT: ; %bb.1: ; %if ; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 @@ -538,8 +542,8 @@ define amdgpu_ps float @test_wwm3(i32 inreg %idx) { ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX10-W32-NEXT: .LBB13_2: ; %endif ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-W32-NEXT: .LBB13_2: ; %endif ; GFX10-W32-NEXT: ; return to shader part epilog main_body: ; use mbcnt to make sure the branch is divergent @@ -570,9 +574,11 @@ define amdgpu_ps float @test_wwm4(i32 inreg %idx) { ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 
16, v0 +; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-W64-NEXT: s_cbranch_execz .LBB14_2 +; GFX9-W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-W64-NEXT: s_cmov_b64 exec, vcc +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB14_2 ; GFX9-W64-NEXT: ; %bb.1: ; %if ; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 @@ -581,18 +587,20 @@ define amdgpu_ps float @test_wwm4(i32 inreg %idx) { ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1 ; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-W64-NEXT: .LBB14_2: ; %endif ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-W64-NEXT: .LBB14_2: ; %endif ; GFX9-W64-NEXT: ; return to shader part epilog ; ; GFX10-W32-LABEL: test_wwm4: ; GFX10-W32: ; %bb.0: ; %main_body ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10-W32-NEXT: s_cbranch_execz .LBB14_2 +; GFX10-W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB14_2 ; GFX10-W32-NEXT: ; %bb.1: ; %if ; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 @@ -601,8 +609,8 @@ define amdgpu_ps float @test_wwm4(i32 inreg %idx) { ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 -; GFX10-W32-NEXT: .LBB14_2: ; %endif ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-W32-NEXT: .LBB14_2: ; %endif ; GFX10-W32-NEXT: ; return to shader part epilog main_body: ; use mbcnt to make sure the branch is divergent @@ -685,16 +693,18 @@ main_body: define amdgpu_ps float @test_wwm6_then() { ; GFX9-W64-LABEL: test_wwm6_then: ; GFX9-W64: ; %bb.0: ; %main_body -; GFX9-W64-NEXT: 
s_or_saveexec_b64 s[0:1], -1 +; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec +; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-W64-NEXT: s_cbranch_execz .LBB16_2 +; GFX9-W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-W64-NEXT: s_cmov_b64 exec, vcc +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB16_2 ; GFX9-W64-NEXT: ; %bb.1: ; %if ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc @@ -702,22 +712,24 @@ define amdgpu_ps float @test_wwm6_then() { ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-W64-NEXT: .LBB16_2: ; %endif ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-W64-NEXT: .LBB16_2: ; %endif ; GFX9-W64-NEXT: ; return to shader part epilog ; ; GFX10-W32-LABEL: test_wwm6_then: ; GFX10-W32: ; %bb.0: ; %main_body -; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) -; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-W32-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX10-W32-NEXT: s_cbranch_execz .LBB16_2 +; GFX10-W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB16_2 ; GFX10-W32-NEXT: ; %bb.1: ; %if ; GFX10-W32-NEXT: 
s_or_saveexec_b32 s1, -1 ; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc @@ -725,8 +737,8 @@ define amdgpu_ps float @test_wwm6_then() { ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 -; GFX10-W32-NEXT: .LBB16_2: ; %endif ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-W32-NEXT: .LBB16_2: ; %endif ; GFX10-W32-NEXT: ; return to shader part epilog main_body: %src0 = load volatile float, ptr addrspace(1) undef @@ -772,14 +784,14 @@ define amdgpu_ps float @test_wwm6_loop() { ; GFX9-W64-NEXT: v_add_u32_e32 v3, -1, v3 ; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-W64-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v2 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-W64-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-W64-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-W64-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-W64-NEXT: s_cbranch_execnz .LBB17_1 +; GFX9-W64-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-W64-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX9-W64-NEXT: ; %bb.2: ; %endloop -; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-W64-NEXT: ; return to shader part epilog ; ; GFX10-W32-LABEL: test_wwm6_loop: @@ -804,10 +816,10 @@ define amdgpu_ps float @test_wwm6_loop() { ; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-W32-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX10-W32-NEXT: s_cbranch_execnz .LBB17_1 +; GFX10-W32-NEXT: s_andn2_b32 s1, exec_lo, s0 +; GFX10-W32-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX10-W32-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX10-W32-NEXT: ; %bb.2: ; %endloop -; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX10-W32-NEXT: ; return to shader part epilog main_body: %src0 = load volatile float, ptr addrspace(1) undef @@ -965,9 +977,11 @@ define amdgpu_ps float 
@test_strict_wqm3(i32 inreg %idx) { ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 +; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-W64-NEXT: s_cbranch_execz .LBB21_2 +; GFX9-W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-W64-NEXT: s_cmov_b64 exec, vcc +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB21_2 ; GFX9-W64-NEXT: ; %bb.1: ; %if ; GFX9-W64-NEXT: s_mov_b64 s[4:5], exec ; GFX9-W64-NEXT: s_wqm_b64 exec, exec @@ -978,18 +992,20 @@ define amdgpu_ps float @test_strict_wqm3(i32 inreg %idx) { ; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-W64-NEXT: .LBB21_2: ; %endif ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-W64-NEXT: .LBB21_2: ; %endif ; GFX9-W64-NEXT: ; return to shader part epilog ; ; GFX10-W32-LABEL: test_strict_wqm3: ; GFX10-W32: ; %bb.0: ; %main_body ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10-W32-NEXT: s_cbranch_execz .LBB21_2 +; GFX10-W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB21_2 ; GFX10-W32-NEXT: ; %bb.1: ; %if ; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo @@ -1000,8 +1016,8 @@ define amdgpu_ps float @test_strict_wqm3(i32 inreg %idx) { ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX10-W32-NEXT: .LBB21_2: ; %endif ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-W32-NEXT: .LBB21_2: ; %endif ; GFX10-W32-NEXT: ; return to shader part epilog main_body: ; use mbcnt to make sure the 
branch is divergent @@ -1032,9 +1048,11 @@ define amdgpu_ps float @test_strict_wqm4(i32 inreg %idx) { ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 +; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-W64-NEXT: s_cbranch_execz .LBB22_2 +; GFX9-W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-W64-NEXT: s_cmov_b64 exec, vcc +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB22_2 ; GFX9-W64-NEXT: ; %bb.1: ; %if ; GFX9-W64-NEXT: s_mov_b64 s[4:5], exec ; GFX9-W64-NEXT: s_wqm_b64 exec, exec @@ -1044,18 +1062,20 @@ define amdgpu_ps float @test_strict_wqm4(i32 inreg %idx) { ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1 ; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-W64-NEXT: .LBB22_2: ; %endif ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-W64-NEXT: .LBB22_2: ; %endif ; GFX9-W64-NEXT: ; return to shader part epilog ; ; GFX10-W32-LABEL: test_strict_wqm4: ; GFX10-W32: ; %bb.0: ; %main_body ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10-W32-NEXT: s_cbranch_execz .LBB22_2 +; GFX10-W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB22_2 ; GFX10-W32-NEXT: ; %bb.1: ; %if ; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo @@ -1065,8 +1085,8 @@ define amdgpu_ps float @test_strict_wqm4(i32 inreg %idx) { ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 -; GFX10-W32-NEXT: .LBB22_2: ; %endif ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-W32-NEXT: .LBB22_2: ; %endif ; GFX10-W32-NEXT: ; 
return to shader part epilog main_body: ; use mbcnt to make sure the branch is divergent @@ -1153,16 +1173,18 @@ define amdgpu_ps float @test_strict_wqm6_then() { ; GFX9-W64-LABEL: test_strict_wqm6_then: ; GFX9-W64: ; %bb.0: ; %main_body ; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec +; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec ; GFX9-W64-NEXT: s_wqm_b64 exec, exec ; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-W64-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-W64-NEXT: s_cmov_b64 exec, vcc +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB24_2 ; GFX9-W64-NEXT: ; %bb.1: ; %if ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec ; GFX9-W64-NEXT: s_wqm_b64 exec, exec @@ -1171,23 +1193,25 @@ define amdgpu_ps float @test_strict_wqm6_then() { ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-W64-NEXT: .LBB24_2: ; %endif ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-W64-NEXT: .LBB24_2: ; %endif ; GFX9-W64-NEXT: ; return to shader part epilog ; ; GFX10-W32-LABEL: test_strict_wqm6_then: ; GFX10-W32: ; %bb.0: ; %main_body ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) -; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-W32-NEXT: s_and_saveexec_b32 s0, 
vcc_lo -; GFX10-W32-NEXT: s_cbranch_execz .LBB24_2 +; GFX10-W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB24_2 ; GFX10-W32-NEXT: ; %bb.1: ; %if ; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo @@ -1196,8 +1220,8 @@ define amdgpu_ps float @test_strict_wqm6_then() { ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 -; GFX10-W32-NEXT: .LBB24_2: ; %endif ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-W32-NEXT: .LBB24_2: ; %endif ; GFX10-W32-NEXT: ; return to shader part epilog main_body: %src0 = load volatile float, ptr addrspace(1) undef @@ -1246,14 +1270,14 @@ define amdgpu_ps float @test_strict_wqm6_loop() { ; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec ; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v2 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-W64-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-W64-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-W64-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-W64-NEXT: s_cbranch_execnz .LBB25_1 +; GFX9-W64-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-W64-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX9-W64-NEXT: ; %bb.2: ; %endloop -; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-W64-NEXT: ; return to shader part epilog ; ; GFX10-W32-LABEL: test_strict_wqm6_loop: @@ -1281,10 +1305,10 @@ define amdgpu_ps float @test_strict_wqm6_loop() { ; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-W32-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX10-W32-NEXT: s_cbranch_execnz .LBB25_1 +; GFX10-W32-NEXT: s_andn2_b32 s1, exec_lo, s0 +; GFX10-W32-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX10-W32-NEXT: s_cbranch_scc1 .LBB25_1 ; 
GFX10-W32-NEXT: ; %bb.2: ; %endloop -; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX10-W32-NEXT: ; return to shader part epilog main_body: %src0 = load volatile float, ptr addrspace(1) undef @@ -1365,23 +1389,27 @@ define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec ; GFX9-W64-NEXT: s_wqm_b64 exec, exec ; GFX9-W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc -; GFX9-W64-NEXT: s_xor_b64 s[14:15], exec, s[14:15] -; GFX9-W64-NEXT: s_cbranch_execz .LBB27_2 +; GFX9-W64-NEXT: s_xor_b64 s[16:17], vcc, exec +; GFX9-W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-W64-NEXT: s_cmov_b64 exec, vcc +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB27_2 ; GFX9-W64-NEXT: ; %bb.1: ; %ELSE -; GFX9-W64-NEXT: s_and_saveexec_b64 s[16:17], s[12:13] +; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], s[12:13] ; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen ; GFX9-W64-NEXT: ; implicit-def: $vgpr0 -; GFX9-W64-NEXT: s_mov_b64 exec, s[16:17] +; GFX9-W64-NEXT: s_mov_b64 exec, s[14:15] +; GFX9-W64-NEXT: s_or_b64 exec, exec, s[16:17] ; GFX9-W64-NEXT: .LBB27_2: ; %Flow -; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[14:15], s[14:15] -; GFX9-W64-NEXT: s_cbranch_execz .LBB27_4 +; GFX9-W64-NEXT: s_xor_b64 s[14:15], s[16:17], exec +; GFX9-W64-NEXT: s_cmp_lg_u64 s[16:17], 0 +; GFX9-W64-NEXT: s_cmov_b64 exec, s[16:17] +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB27_4 ; GFX9-W64-NEXT: ; %bb.3: ; %IF ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: image_sample v2, v0, s[0:7], s[8:11] dmask:0x1 -; GFX9-W64-NEXT: .LBB27_4: ; %END ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-W64-NEXT: .LBB27_4: ; %END ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 @@ -1391,24 +1419,28 @@ define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10-W32: ; 
%bb.0: ; %main_body ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo -; GFX10-W32-NEXT: v_cmpx_ne_u32_e32 0, v1 -; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13 -; GFX10-W32-NEXT: s_cbranch_execz .LBB27_2 +; GFX10-W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX10-W32-NEXT: s_xor_b32 s14, vcc_lo, exec_lo +; GFX10-W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB27_2 ; GFX10-W32-NEXT: ; %bb.1: ; %ELSE -; GFX10-W32-NEXT: s_and_saveexec_b32 s14, s12 +; GFX10-W32-NEXT: s_and_saveexec_b32 s13, s12 ; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen ; GFX10-W32-NEXT: ; implicit-def: $vgpr0 -; GFX10-W32-NEXT: s_mov_b32 exec_lo, s14 +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s13 +; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s14 ; GFX10-W32-NEXT: .LBB27_2: ; %Flow -; GFX10-W32-NEXT: s_andn2_saveexec_b32 s13, s13 -; GFX10-W32-NEXT: s_cbranch_execz .LBB27_4 +; GFX10-W32-NEXT: s_xor_b32 s13, s14, exec_lo +; GFX10-W32-NEXT: s_cmp_lg_u32 s14, 0 +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, s14 +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB27_4 ; GFX10-W32-NEXT: ; %bb.3: ; %IF ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: image_sample v2, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D -; GFX10-W32-NEXT: .LBB27_4: ; %END ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-W32-NEXT: .LBB27_4: ; %END ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 @@ -1441,25 +1473,27 @@ define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec ; GFX9-W64-NEXT: s_wqm_b64 exec, exec ; GFX9-W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc -; GFX9-W64-NEXT: s_xor_b64 s[14:15], exec, s[14:15] 
-; GFX9-W64-NEXT: s_cbranch_execz .LBB28_2 +; GFX9-W64-NEXT: s_xor_b64 s[14:15], vcc, exec +; GFX9-W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-W64-NEXT: s_cmov_b64 exec, vcc +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB28_2 ; GFX9-W64-NEXT: ; %bb.1: ; %IF ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: image_sample v2, v0, s[0:7], s[8:11] dmask:0x1 ; GFX9-W64-NEXT: ; implicit-def: $vgpr0 +; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15] ; GFX9-W64-NEXT: .LBB28_2: ; %Flow -; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], s[14:15] ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] -; GFX9-W64-NEXT: s_and_b64 s[0:1], exec, s[0:1] -; GFX9-W64-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX9-W64-NEXT: s_cbranch_execz .LBB28_4 +; GFX9-W64-NEXT: s_xor_b64 s[0:1], s[14:15], exec +; GFX9-W64-NEXT: s_cmp_lg_u64 s[14:15], 0 +; GFX9-W64-NEXT: s_cmov_b64 exec, s[14:15] +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB28_4 ; GFX9-W64-NEXT: ; %bb.3: ; %ELSE ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen -; GFX9-W64-NEXT: .LBB28_4: ; %END ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-W64-NEXT: .LBB28_4: ; %END ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-W64-NEXT: ; return to shader part epilog @@ -1468,26 +1502,28 @@ define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10-W32: ; %bb.0: ; %main_body ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo -; GFX10-W32-NEXT: v_cmpx_ne_u32_e32 0, v1 -; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13 -; GFX10-W32-NEXT: s_cbranch_execz .LBB28_2 +; GFX10-W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX10-W32-NEXT: s_xor_b32 s13, vcc_lo, exec_lo +; GFX10-W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB28_2 ; GFX10-W32-NEXT: ; %bb.1: ; %IF ; GFX10-W32-NEXT: 
image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: image_sample v2, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: ; implicit-def: $vgpr0 +; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13 ; GFX10-W32-NEXT: .LBB28_2: ; %Flow -; GFX10-W32-NEXT: s_or_saveexec_b32 s0, s13 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX10-W32-NEXT: s_and_b32 s0, exec_lo, s0 -; GFX10-W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX10-W32-NEXT: s_cbranch_execz .LBB28_4 +; GFX10-W32-NEXT: s_xor_b32 s0, s13, exec_lo +; GFX10-W32-NEXT: s_cmp_lg_u32 s13, 0 +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, s13 +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB28_4 ; GFX10-W32-NEXT: ; %bb.3: ; %ELSE ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen -; GFX10-W32-NEXT: .LBB28_4: ; %END ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-W32-NEXT: .LBB28_4: ; %END ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-W32-NEXT: ; return to shader part epilog @@ -1527,18 +1563,24 @@ define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i3 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec ; GFX9-W64-NEXT: s_waitcnt vmcnt(1) ; GFX9-W64-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 +; GFX9-W64-NEXT: s_xor_b64 s[14:15], vcc, exec +; GFX9-W64-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-W64-NEXT: ; implicit-def: $vgpr0 -; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc -; GFX9-W64-NEXT: s_xor_b64 s[14:15], exec, s[14:15] +; GFX9-W64-NEXT: s_cmov_b64 exec, vcc +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB29_2 ; GFX9-W64-NEXT: ; %bb.1: ; %ELSE ; GFX9-W64-NEXT: v_lshlrev_b32_e32 v0, 2, v5 ; GFX9-W64-NEXT: ; implicit-def: $vgpr5 -; GFX9-W64-NEXT: ; %bb.2: ; %Flow -; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[14:15], s[14:15] +; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-W64-NEXT: .LBB29_2: ; %Flow +; GFX9-W64-NEXT: s_xor_b64 s[16:17], s[14:15], exec +; GFX9-W64-NEXT: 
s_cmp_lg_u64 s[14:15], 0 +; GFX9-W64-NEXT: s_cmov_b64 exec, s[14:15] +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB29_4 ; GFX9-W64-NEXT: ; %bb.3: ; %IF ; GFX9-W64-NEXT: v_lshl_add_u32 v0, v5, 1, v5 -; GFX9-W64-NEXT: ; %bb.4: ; %END -; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-W64-NEXT: s_or_b64 exec, exec, s[16:17] +; GFX9-W64-NEXT: .LBB29_4: ; %END ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) @@ -1554,21 +1596,27 @@ define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i3 ; GFX10-W32-NEXT: buffer_load_dword v0, v1, s[0:3], 0 idxen ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0, v0 +; GFX10-W32-NEXT: s_xor_b32 s13, vcc_lo, exec_lo ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-W32-NEXT: buffer_store_dword v4, v2, s[0:3], 0 idxen ; GFX10-W32-NEXT: ; implicit-def: $vgpr0 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo -; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13 +; GFX10-W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB29_2 ; GFX10-W32-NEXT: ; %bb.1: ; %ELSE ; GFX10-W32-NEXT: v_lshlrev_b32_e32 v0, 2, v5 ; GFX10-W32-NEXT: ; implicit-def: $vgpr5 -; GFX10-W32-NEXT: ; %bb.2: ; %Flow -; GFX10-W32-NEXT: s_andn2_saveexec_b32 s13, s13 +; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-W32-NEXT: .LBB29_2: ; %Flow +; GFX10-W32-NEXT: s_xor_b32 s14, s13, exec_lo +; GFX10-W32-NEXT: s_cmp_lg_u32 s13, 0 +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, s13 +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB29_4 ; GFX10-W32-NEXT: ; %bb.3: ; %IF ; GFX10-W32-NEXT: v_lshl_add_u32 v0, v5, 1, v5 -; GFX10-W32-NEXT: ; %bb.4: ; %END -; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s14 +; GFX10-W32-NEXT: .LBB29_4: ; %END ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 
; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) @@ -1618,28 +1666,26 @@ define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v1 ; GFX9-W64-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-W64-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-W64-NEXT: ; implicit-def: $vgpr0 -; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-W64-NEXT: s_cbranch_execnz .LBB30_3 -; GFX9-W64-NEXT: ; %bb.1: ; %Flow -; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX9-W64-NEXT: s_cbranch_execnz .LBB30_4 -; GFX9-W64-NEXT: .LBB30_2: ; %END -; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: s_branch .LBB30_5 -; GFX9-W64-NEXT: .LBB30_3: ; %ELSE +; GFX9-W64-NEXT: s_cmov_b64 exec, vcc +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB30_2 +; GFX9-W64-NEXT: ; %bb.1: ; %ELSE ; GFX9-W64-NEXT: v_mul_f32_e32 v0, 4.0, v1 ; GFX9-W64-NEXT: ; implicit-def: $vgpr1 -; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX9-W64-NEXT: s_cbranch_execz .LBB30_2 -; GFX9-W64-NEXT: .LBB30_4: ; %IF -; GFX9-W64-NEXT: v_mul_f32_e32 v0, 0x40400000, v1 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-W64-NEXT: .LBB30_2: ; %Flow +; GFX9-W64-NEXT: s_xor_b64 s[2:3], s[0:1], exec +; GFX9-W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-W64-NEXT: s_cmov_b64 exec, s[0:1] +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB30_4 +; GFX9-W64-NEXT: ; %bb.3: ; %IF +; GFX9-W64-NEXT: v_mul_f32_e32 v0, 0x40400000, v1 +; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-W64-NEXT: .LBB30_4: ; %END ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: s_branch .LBB30_5 -; GFX9-W64-NEXT: .LBB30_5: +; GFX9-W64-NEXT: ; return to shader part epilog ; ; GFX10-W32-LABEL: test_control_flow_3: ; GFX10-W32: ; %bb.0: ; %main_body @@ -1650,28 
+1696,27 @@ define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0, v1 +; GFX10-W32-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX10-W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 ; GFX10-W32-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen -; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo ; GFX10-W32-NEXT: ; implicit-def: $vgpr0 -; GFX10-W32-NEXT: v_cmpx_nlt_f32_e32 0, v1 -; GFX10-W32-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX10-W32-NEXT: s_cbranch_execnz .LBB30_3 -; GFX10-W32-NEXT: ; %bb.1: ; %Flow -; GFX10-W32-NEXT: s_andn2_saveexec_b32 s0, s0 -; GFX10-W32-NEXT: s_cbranch_execnz .LBB30_4 -; GFX10-W32-NEXT: .LBB30_2: ; %END -; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10-W32-NEXT: s_branch .LBB30_5 -; GFX10-W32-NEXT: .LBB30_3: ; %ELSE +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB30_2 +; GFX10-W32-NEXT: ; %bb.1: ; %ELSE ; GFX10-W32-NEXT: v_mul_f32_e32 v0, 4.0, v1 ; GFX10-W32-NEXT: ; implicit-def: $vgpr1 -; GFX10-W32-NEXT: s_andn2_saveexec_b32 s0, s0 -; GFX10-W32-NEXT: s_cbranch_execz .LBB30_2 -; GFX10-W32-NEXT: .LBB30_4: ; %IF -; GFX10-W32-NEXT: v_mul_f32_e32 v0, 0x40400000, v1 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10-W32-NEXT: s_branch .LBB30_5 -; GFX10-W32-NEXT: .LBB30_5: +; GFX10-W32-NEXT: .LBB30_2: ; %Flow +; GFX10-W32-NEXT: s_xor_b32 s1, s0, exec_lo +; GFX10-W32-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, s0 +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB30_4 +; GFX10-W32-NEXT: ; %bb.3: ; %IF +; GFX10-W32-NEXT: v_mul_f32_e32 v0, 0x40400000, v1 +; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-W32-NEXT: .LBB30_4: ; %END +; GFX10-W32-NEXT: ; return to shader part epilog main_body: %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, 
<4 x i32> %sampler, i1 false, i32 0, i32 0) #0 %tex0 = extractelement <4 x float> %tex, i32 0 @@ -1702,8 +1747,10 @@ define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i3 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec ; GFX9-W64-NEXT: s_wqm_b64 exec, exec ; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc -; GFX9-W64-NEXT: s_cbranch_execz .LBB31_2 +; GFX9-W64-NEXT: s_mov_b64 s[14:15], exec +; GFX9-W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-W64-NEXT: s_cmov_b64 exec, vcc +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB31_2 ; GFX9-W64-NEXT: ; %bb.1: ; %IF ; GFX9-W64-NEXT: s_and_saveexec_b64 s[16:17], s[12:13] ; GFX9-W64-NEXT: buffer_load_dword v1, off, s[0:3], 0 @@ -1711,8 +1758,8 @@ define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i3 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: buffer_store_dword v1, v2, s[0:3], 0 idxen ; GFX9-W64-NEXT: s_mov_b64 exec, s[16:17] -; GFX9-W64-NEXT: .LBB31_2: ; %END ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-W64-NEXT: .LBB31_2: ; %END ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) @@ -1724,9 +1771,11 @@ define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i3 ; GFX10-W32: ; %bb.0: ; %main_body ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo -; GFX10-W32-NEXT: v_cmpx_eq_u32_e32 0, v1 -; GFX10-W32-NEXT: s_cbranch_execz .LBB31_2 +; GFX10-W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB31_2 ; GFX10-W32-NEXT: ; %bb.1: ; %IF ; GFX10-W32-NEXT: s_and_saveexec_b32 s14, s12 ; GFX10-W32-NEXT: buffer_load_dword v1, off, s[0:3], 0 @@ -1734,8 +1783,8 @@ define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i3 ; 
GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: buffer_store_dword v1, v2, s[0:3], 0 idxen ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s14 -; GFX10-W32-NEXT: .LBB31_2: ; %END ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-W32-NEXT: .LBB31_2: ; %END ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) @@ -2254,9 +2303,11 @@ define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec ; GFX9-W64-NEXT: s_wqm_b64 exec, exec ; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-W64-NEXT: s_mov_b64 s[14:15], exec ; GFX9-W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc -; GFX9-W64-NEXT: s_cbranch_execz .LBB40_2 +; GFX9-W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-W64-NEXT: s_cmov_b64 exec, vcc +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB40_2 ; GFX9-W64-NEXT: ; %bb.1: ; %IF ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) @@ -2273,8 +2324,8 @@ define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v1, v0 -; GFX9-W64-NEXT: .LBB40_2: ; %ENDIF ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-W64-NEXT: .LBB40_2: ; %ENDIF ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-W64-NEXT: ; return to shader part epilog @@ -2285,8 +2336,10 @@ define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo -; GFX10-W32-NEXT: s_cbranch_execz .LBB40_2 +; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo +; GFX10-W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-W32-NEXT: 
s_cmov_b32 exec_lo, vcc_lo +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB40_2 ; GFX10-W32-NEXT: ; %bb.1: ; %IF ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) @@ -2303,8 +2356,8 @@ define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v1, v0 -; GFX10-W32-NEXT: .LBB40_2: ; %ENDIF ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-W32-NEXT: .LBB40_2: ; %ENDIF ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-W32-NEXT: ; return to shader part epilog @@ -2418,9 +2471,11 @@ define amdgpu_ps float @test_strict_wwm3(i32 inreg %idx) { ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 +; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-W64-NEXT: s_cbranch_execz .LBB43_2 +; GFX9-W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-W64-NEXT: s_cmov_b64 exec, vcc +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB43_2 ; GFX9-W64-NEXT: ; %bb.1: ; %if ; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 @@ -2430,18 +2485,20 @@ define amdgpu_ps float @test_strict_wwm3(i32 inreg %idx) { ; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-W64-NEXT: .LBB43_2: ; %endif ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-W64-NEXT: .LBB43_2: ; %endif ; GFX9-W64-NEXT: ; return to shader part epilog ; ; GFX10-W32-LABEL: test_strict_wwm3: ; GFX10-W32: ; %bb.0: ; %main_body ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0 ; GFX10-W32-NEXT: v_mov_b32_e32 
v0, 0 -; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10-W32-NEXT: s_cbranch_execz .LBB43_2 +; GFX10-W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB43_2 ; GFX10-W32-NEXT: ; %bb.1: ; %if ; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 @@ -2451,8 +2508,8 @@ define amdgpu_ps float @test_strict_wwm3(i32 inreg %idx) { ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX10-W32-NEXT: .LBB43_2: ; %endif ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-W32-NEXT: .LBB43_2: ; %endif ; GFX10-W32-NEXT: ; return to shader part epilog main_body: ; use mbcnt to make sure the branch is divergent @@ -2483,9 +2540,11 @@ define amdgpu_ps float @test_strict_wwm4(i32 inreg %idx) { ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 +; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-W64-NEXT: s_cbranch_execz .LBB44_2 +; GFX9-W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-W64-NEXT: s_cmov_b64 exec, vcc +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB44_2 ; GFX9-W64-NEXT: ; %bb.1: ; %if ; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 @@ -2494,18 +2553,20 @@ define amdgpu_ps float @test_strict_wwm4(i32 inreg %idx) { ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1 ; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-W64-NEXT: .LBB44_2: ; %endif ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-W64-NEXT: .LBB44_2: ; %endif ; GFX9-W64-NEXT: ; return to shader part epilog ; ; GFX10-W32-LABEL: test_strict_wwm4: ; GFX10-W32: ; %bb.0: ; %main_body ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; 
GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10-W32-NEXT: s_cbranch_execz .LBB44_2 +; GFX10-W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB44_2 ; GFX10-W32-NEXT: ; %bb.1: ; %if ; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 @@ -2514,8 +2575,8 @@ define amdgpu_ps float @test_strict_wwm4(i32 inreg %idx) { ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 -; GFX10-W32-NEXT: .LBB44_2: ; %endif ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-W32-NEXT: .LBB44_2: ; %endif ; GFX10-W32-NEXT: ; return to shader part epilog main_body: ; use mbcnt to make sure the branch is divergent @@ -2598,16 +2659,18 @@ main_body: define amdgpu_ps float @test_strict_wwm6_then() { ; GFX9-W64-LABEL: test_strict_wwm6_then: ; GFX9-W64: ; %bb.0: ; %main_body -; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec +; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-W64-NEXT: s_cbranch_execz .LBB46_2 +; GFX9-W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-W64-NEXT: s_cmov_b64 exec, vcc +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB46_2 ; GFX9-W64-NEXT: ; %bb.1: ; %if ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc @@ -2615,22 +2678,24 @@ define amdgpu_ps float @test_strict_wwm6_then() { ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX9-W64-NEXT: s_mov_b64 exec, 
s[2:3] ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-W64-NEXT: .LBB46_2: ; %endif ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-W64-NEXT: .LBB46_2: ; %endif ; GFX9-W64-NEXT: ; return to shader part epilog ; ; GFX10-W32-LABEL: test_strict_wwm6_then: ; GFX10-W32: ; %bb.0: ; %main_body -; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) -; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-W32-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX10-W32-NEXT: s_cbranch_execz .LBB46_2 +; GFX10-W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB46_2 ; GFX10-W32-NEXT: ; %bb.1: ; %if ; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc @@ -2638,8 +2703,8 @@ define amdgpu_ps float @test_strict_wwm6_then() { ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 -; GFX10-W32-NEXT: .LBB46_2: ; %endif ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-W32-NEXT: .LBB46_2: ; %endif ; GFX10-W32-NEXT: ; return to shader part epilog main_body: %src0 = load volatile float, ptr addrspace(1) undef @@ -2681,14 +2746,14 @@ define amdgpu_ps float @test_strict_wwm6_loop() { ; GFX9-W64-NEXT: v_add_u32_e32 v3, -1, v3 ; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-W64-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v2 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-W64-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-W64-NEXT: s_andn2_b64 s[2:3], exec, 
s[0:1] ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-W64-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-W64-NEXT: s_cbranch_execnz .LBB47_1 +; GFX9-W64-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-W64-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX9-W64-NEXT: ; %bb.2: ; %endloop -; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-W64-NEXT: ; return to shader part epilog ; ; GFX10-W32-LABEL: test_strict_wwm6_loop: @@ -2713,10 +2778,10 @@ define amdgpu_ps float @test_strict_wwm6_loop() { ; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-W32-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX10-W32-NEXT: s_cbranch_execnz .LBB47_1 +; GFX10-W32-NEXT: s_andn2_b32 s1, exec_lo, s0 +; GFX10-W32-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX10-W32-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX10-W32-NEXT: ; %bb.2: ; %endloop -; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX10-W32-NEXT: ; return to shader part epilog main_body: %src0 = load volatile float, ptr addrspace(1) undef @@ -2790,9 +2855,11 @@ define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec ; GFX9-W64-NEXT: s_wqm_b64 exec, exec ; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-W64-NEXT: s_mov_b64 s[14:15], exec ; GFX9-W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc -; GFX9-W64-NEXT: s_cbranch_execz .LBB49_2 +; GFX9-W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-W64-NEXT: s_cmov_b64 exec, vcc +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB49_2 ; GFX9-W64-NEXT: ; %bb.1: ; %IF ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) @@ -2809,8 +2876,8 @@ define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i ; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v1, v0 -; GFX9-W64-NEXT: .LBB49_2: ; %ENDIF ; GFX9-W64-NEXT: s_or_b64 exec, exec, 
s[14:15] +; GFX9-W64-NEXT: .LBB49_2: ; %ENDIF ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-W64-NEXT: ; return to shader part epilog @@ -2821,8 +2888,10 @@ define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo -; GFX10-W32-NEXT: s_cbranch_execz .LBB49_2 +; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo +; GFX10-W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB49_2 ; GFX10-W32-NEXT: ; %bb.1: ; %IF ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) @@ -2839,8 +2908,8 @@ define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i ; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v1, v0 -; GFX10-W32-NEXT: .LBB49_2: ; %ENDIF ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-W32-NEXT: .LBB49_2: ; %ENDIF ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-W32-NEXT: ; return to shader part epilog @@ -2877,9 +2946,11 @@ define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i ; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-W64-NEXT: s_mov_b64 exec, s[14:15] ; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-W64-NEXT: s_mov_b64 s[14:15], exec ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc -; GFX9-W64-NEXT: s_cbranch_execz .LBB50_2 +; GFX9-W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-W64-NEXT: s_cmov_b64 exec, vcc +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB50_2 ; GFX9-W64-NEXT: ; %bb.1: ; %IF ; GFX9-W64-NEXT: s_mov_b64 s[16:17], exec ; GFX9-W64-NEXT: s_wqm_b64 exec, exec @@ -2893,8 +2964,8 @@ define amdgpu_ps 
float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i ; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-W64-NEXT: .LBB50_2: ; %ENDIF ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-W64-NEXT: .LBB50_2: ; %ENDIF ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-W64-NEXT: ; return to shader part epilog ; @@ -2906,10 +2977,12 @@ define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s13 +; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo -; GFX10-W32-NEXT: v_cmpx_eq_u32_e32 0, v1 -; GFX10-W32-NEXT: s_cbranch_execz .LBB50_2 +; GFX10-W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB50_2 ; GFX10-W32-NEXT: ; %bb.1: ; %IF ; GFX10-W32-NEXT: s_mov_b32 s14, exec_lo ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo @@ -2923,8 +2996,8 @@ define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i ; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX10-W32-NEXT: .LBB50_2: ; %ENDIF ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-W32-NEXT: .LBB50_2: ; %ENDIF ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-W32-NEXT: ; return to shader part epilog main_body: @@ -3469,18 +3542,32 @@ define amdgpu_gs void @wqm_init_exec_switch(i32 %arg) { ; GFX9-W64: ; %bb.0: ; GFX9-W64-NEXT: s_mov_b64 exec, 0 ; GFX9-W64-NEXT: v_cmp_lt_i32_e32 vcc, 0, v0 -; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX9-W64-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-W64-NEXT: 
s_cmov_b64 exec, vcc +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB57_2 +; GFX9-W64-NEXT: ; %bb.1: ; %LeafBlock1 +; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-W64-NEXT: .LBB57_2: ; %Flow +; GFX9-W64-NEXT: s_xor_b64 s[2:3], s[0:1], exec +; GFX9-W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-W64-NEXT: s_cmov_b64 exec, s[0:1] ; GFX9-W64-NEXT: s_endpgm ; ; GFX10-W32-LABEL: wqm_init_exec_switch: ; GFX10-W32: ; %bb.0: ; GFX10-W32-NEXT: s_mov_b32 exec_lo, 0 -; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo -; GFX10-W32-NEXT: v_cmpx_lt_i32_e32 0, v0 -; GFX10-W32-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX10-W32-NEXT: s_andn2_saveexec_b32 s0, s0 +; GFX10-W32-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0, v0 +; GFX10-W32-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX10-W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB57_2 +; GFX10-W32-NEXT: ; %bb.1: ; %LeafBlock1 +; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-W32-NEXT: .LBB57_2: ; %Flow +; GFX10-W32-NEXT: s_xor_b32 s1, s0, exec_lo +; GFX10-W32-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, s0 ; GFX10-W32-NEXT: s_endpgm call void @llvm.amdgcn.init.exec(i64 0) switch i32 %arg, label %bb1 [ @@ -3548,8 +3635,10 @@ define amdgpu_ps float @short_exact_regions(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 -; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc -; GFX9-W64-NEXT: s_cbranch_execz .LBB59_2 +; GFX9-W64-NEXT: s_mov_b64 s[14:15], exec +; GFX9-W64-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-W64-NEXT: s_cmov_b64 exec, vcc +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB59_2 ; GFX9-W64-NEXT: ; %bb.1: ; %if ; GFX9-W64-NEXT: global_load_dword v0, v[1:2], off ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) @@ -3560,8 +3649,8 @@ define amdgpu_ps float @short_exact_regions(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX9-W64-NEXT: s_and_saveexec_b64 s[16:17], s[12:13] ; GFX9-W64-NEXT: 
buffer_store_dwordx4 v[3:6], v0, s[0:3], 0 idxen ; GFX9-W64-NEXT: s_mov_b64 exec, s[16:17] -; GFX9-W64-NEXT: .LBB59_2: ; %endif ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-W64-NEXT: .LBB59_2: ; %endif ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: image_sample v0, v3, s[0:7], s[8:11] dmask:0x4 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) @@ -3578,8 +3667,10 @@ define amdgpu_ps float @short_exact_regions(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 -; GFX10-W32-NEXT: v_cmpx_gt_u32_e32 16, v0 -; GFX10-W32-NEXT: s_cbranch_execz .LBB59_2 +; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0 +; GFX10-W32-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB59_2 ; GFX10-W32-NEXT: ; %bb.1: ; %if ; GFX10-W32-NEXT: global_load_dword v0, v[1:2], off ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) @@ -3590,8 +3681,8 @@ define amdgpu_ps float @short_exact_regions(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10-W32-NEXT: s_and_saveexec_b32 s14, s12 ; GFX10-W32-NEXT: buffer_store_dwordx4 v[3:6], v0, s[0:3], 0 idxen ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s14 -; GFX10-W32-NEXT: .LBB59_2: ; %endif ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-W32-NEXT: .LBB59_2: ; %endif ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: image_sample v0, v3, s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll index e79cb66dcd7760..a880dc79c8c5d4 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -150,89 +150,93 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) ; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded 
Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] -; GFX9-O0-NEXT: s_mov_b32 s40, s6 +; GFX9-O0-NEXT: s_mov_b32 s36, s6 ; GFX9-O0-NEXT: s_mov_b32 s34, s4 -; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41 -; GFX9-O0-NEXT: s_mov_b32 s41, s7 -; GFX9-O0-NEXT: s_mov_b32 s42, s41 -; GFX9-O0-NEXT: s_mov_b32 s43, s40 +; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37 +; GFX9-O0-NEXT: s_mov_b32 s37, s7 +; GFX9-O0-NEXT: s_mov_b32 s38, s37 +; GFX9-O0-NEXT: s_mov_b32 s39, s36 ; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35 ; GFX9-O0-NEXT: s_mov_b32 s35, s5 ; GFX9-O0-NEXT: s_mov_b32 s44, s35 -; GFX9-O0-NEXT: s_mov_b32 s36, s34 -; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39 -; GFX9-O0-NEXT: s_mov_b32 s37, s44 -; GFX9-O0-NEXT: s_mov_b32 s38, s43 -; GFX9-O0-NEXT: s_mov_b32 s39, s42 +; GFX9-O0-NEXT: s_mov_b32 s40, s34 +; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41_sgpr42_sgpr43 +; GFX9-O0-NEXT: s_mov_b32 s41, s44 +; GFX9-O0-NEXT: s_mov_b32 s42, s39 +; GFX9-O0-NEXT: s_mov_b32 s43, s38 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_writelane_b32 v0, s40, 0 -; GFX9-O0-NEXT: v_writelane_b32 v0, s41, 1 +; GFX9-O0-NEXT: v_writelane_b32 v0, s36, 0 +; GFX9-O0-NEXT: v_writelane_b32 v0, s37, 1 ; GFX9-O0-NEXT: v_writelane_b32 v0, s34, 2 ; GFX9-O0-NEXT: v_writelane_b32 v0, s35, 3 -; GFX9-O0-NEXT: s_mov_b32 s34, 0 -; GFX9-O0-NEXT: s_nop 2 -; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], s34 +; GFX9-O0-NEXT: s_mov_b32 s36, 0 +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], off, s[40:43], s36 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 +; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35 ; 
GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s34 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s36 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s34 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s36 ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v2 -; GFX9-O0-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[36:37], v3, s34 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s34 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[34:35], v3, s36 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s36 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[34:35], exec -; GFX9-O0-NEXT: v_writelane_b32 v0, s34, 4 -; GFX9-O0-NEXT: v_writelane_b32 v0, s35, 5 +; GFX9-O0-NEXT: s_mov_b64 s[36:37], exec +; GFX9-O0-NEXT: v_writelane_b32 v0, s36, 4 +; GFX9-O0-NEXT: v_writelane_b32 v0, s37, 5 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] -; GFX9-O0-NEXT: s_and_b64 s[34:35], s[34:35], s[36:37] -; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: s_cbranch_execz .LBB1_2 -; GFX9-O0-NEXT: ; %bb.1: ; %if +; GFX9-O0-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GFX9-O0-NEXT: s_cmov_b64 exec, s[34:35] +; GFX9-O0-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX9-O0-NEXT: s_branch .LBB1_2 +; GFX9-O0-NEXT: .LBB1_1: ; %if ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 +; 
GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s34, v0, 4 +; GFX9-O0-NEXT: v_readlane_b32 s35, v0, 5 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-O0-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O0-NEXT: v_add_u32_e64 v1, v2, v1 -; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-O0-NEXT: .LBB1_2: ; %merge ; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s36, v0, 4 -; GFX9-O0-NEXT: v_readlane_b32 s37, v0, 5 -; GFX9-O0-NEXT: s_or_b64 exec, exec, s[36:37] +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-O0-NEXT: v_readlane_b32 s38, v0, 0 ; GFX9-O0-NEXT: v_readlane_b32 s39, v0, 1 ; GFX9-O0-NEXT: v_readlane_b32 s34, v0, 2 ; GFX9-O0-NEXT: v_readlane_b32 s35, v0, 3 -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], 
s32 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[36:37], v3, v4 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[36:37] @@ -267,23 +271,25 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) ; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; GFX9-O3-NEXT: s_mov_b64 s[34:35], exec ; GFX9-O3-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: v_mov_b32_e32 v2, v3 ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-O3-NEXT: s_and_saveexec_b64 s[34:35], vcc -; GFX9-O3-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-O3-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-O3-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-O3-NEXT: s_cmov_b64 exec, vcc +; GFX9-O3-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX9-O3-NEXT: ; %bb.1: ; %if ; GFX9-O3-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 @@ -297,9 +303,9 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-O3-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O3-NEXT: .LBB1_2: ; %merge ; GFX9-O3-NEXT: s_or_b64 exec, exec, s[34:35] -; GFX9-O3-NEXT: 
v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX9-O3-NEXT: .LBB1_2: ; %merge +; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX9-O3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-O3-NEXT: v_and_b32_e32 v0, 2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll index a74dbe1de0d39e..76e6a25e78ba74 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -146,70 +146,74 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 2 ; GFX9-O0-NEXT: v_writelane_b32 v0, s0, 3 ; GFX9-O0-NEXT: v_writelane_b32 v0, s1, 4 -; GFX9-O0-NEXT: s_mov_b32 s0, 0 -; GFX9-O0-NEXT: s_nop 2 -; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], s0 +; GFX9-O0-NEXT: s_mov_b32 s2, 0 +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], s2 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill -; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3 +; GFX9-O0-NEXT: ; implicit-def: $sgpr0_sgpr1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v2 -; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill -; GFX9-O0-NEXT: 
v_cmp_eq_u32_e64 s[2:3], v3, s0 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v3, s2 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[0:1], exec -; GFX9-O0-NEXT: v_writelane_b32 v0, s0, 5 -; GFX9-O0-NEXT: v_writelane_b32 v0, s1, 6 +; GFX9-O0-NEXT: s_mov_b64 s[2:3], exec +; GFX9-O0-NEXT: v_writelane_b32 v0, s2, 5 +; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 6 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[12:13] -; GFX9-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-O0-NEXT: s_cbranch_execz .LBB1_2 -; GFX9-O0-NEXT: ; %bb.1: ; %if +; GFX9-O0-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-O0-NEXT: s_cmov_b64 exec, s[0:1] +; GFX9-O0-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX9-O0-NEXT: s_branch .LBB1_2 +; GFX9-O0-NEXT: .LBB1_1: ; %if ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:16 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s0, v0, 5 +; GFX9-O0-NEXT: v_readlane_b32 s1, v0, 6 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-O0-NEXT: v_mov_b32_dpp v1, v2 
row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O0-NEXT: v_add_u32_e64 v1, v2, v1 -; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-O0-NEXT: .LBB1_2: ; %merge ; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[12:13] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v0, 5 -; GFX9-O0-NEXT: v_readlane_b32 s5, v0, 6 -; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-O0-NEXT: v_readlane_b32 s2, v0, 1 ; GFX9-O0-NEXT: v_readlane_b32 s3, v0, 2 ; GFX9-O0-NEXT: v_readlane_b32 s0, v0, 3 ; GFX9-O0-NEXT: v_readlane_b32 s1, v0, 4 -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v4 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] @@ -233,23 +237,25 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O3-LABEL: cfg: ; GFX9-O3: ; %bb.0: ; %entry ; GFX9-O3-NEXT: buffer_load_dwordx2 v[3:4], off, s[0:3], 0 +; GFX9-O3-NEXT: s_mov_b64 s[4:5], exec ; GFX9-O3-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: v_mov_b32_e32 v2, v3 ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-O3-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-O3-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-O3-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-O3-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-O3-NEXT: s_cmov_b64 exec, vcc +; GFX9-O3-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX9-O3-NEXT: ; %bb.1: ; %if ; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 @@ -263,9 +269,9 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O3-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O3-NEXT: .LBB1_2: ; %merge ; GFX9-O3-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX9-O3-NEXT: .LBB1_2: ; %merge +; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX9-O3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-O3-NEXT: v_and_b32_e32 v0, 2, v0 @@ -1047,70 +1053,74 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 2 ; GFX9-O0-NEXT: v_writelane_b32 v0, s0, 3 ; GFX9-O0-NEXT: v_writelane_b32 v0, s1, 4 -; GFX9-O0-NEXT: s_mov_b32 s0, 0 -; GFX9-O0-NEXT: s_nop 2 -; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], s0 +; GFX9-O0-NEXT: s_mov_b32 s2, 0 +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], s2 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v5, off, 
s[16:19], 0 offset:16 ; 4-byte Folded Spill -; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3 +; GFX9-O0-NEXT: ; implicit-def: $sgpr0_sgpr1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v2 -; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v3, s0 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v3, s2 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[0:1], exec -; GFX9-O0-NEXT: v_writelane_b32 v0, s0, 5 -; GFX9-O0-NEXT: v_writelane_b32 v0, s1, 6 +; GFX9-O0-NEXT: s_mov_b64 s[2:3], exec +; GFX9-O0-NEXT: v_writelane_b32 v0, s2, 5 +; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 6 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[12:13] -; GFX9-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-O0-NEXT: s_cbranch_execz .LBB8_2 -; GFX9-O0-NEXT: ; %bb.1: ; %if +; GFX9-O0-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-O0-NEXT: s_cmov_b64 exec, s[0:1] +; GFX9-O0-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX9-O0-NEXT: s_branch .LBB8_2 +; GFX9-O0-NEXT: .LBB8_1: ; %if ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v4, off, 
s[16:19], 0 offset:16 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s0, v0, 5 +; GFX9-O0-NEXT: v_readlane_b32 s1, v0, 6 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-O0-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O0-NEXT: v_add_u32_e64 v1, v2, v1 -; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-O0-NEXT: .LBB8_2: ; %merge ; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[12:13] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v0, 5 -; GFX9-O0-NEXT: v_readlane_b32 s5, v0, 6 -; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-O0-NEXT: v_readlane_b32 s2, v0, 1 ; GFX9-O0-NEXT: v_readlane_b32 s3, v0, 2 ; GFX9-O0-NEXT: v_readlane_b32 s0, v0, 3 ; GFX9-O0-NEXT: v_readlane_b32 s1, v0, 4 -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:8 ; 4-byte 
Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v4 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] @@ -1134,23 +1144,25 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O3-LABEL: strict_wwm_cfg: ; GFX9-O3: ; %bb.0: ; %entry ; GFX9-O3-NEXT: buffer_load_dwordx2 v[3:4], off, s[0:3], 0 +; GFX9-O3-NEXT: s_mov_b64 s[4:5], exec ; GFX9-O3-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: v_mov_b32_e32 v2, v3 ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-O3-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-O3-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-O3-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-O3-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-O3-NEXT: s_cmov_b64 exec, vcc +; GFX9-O3-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX9-O3-NEXT: ; %bb.1: ; %if ; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 @@ -1164,9 +1176,9 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O3-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O3-NEXT: .LBB8_2: ; %merge ; GFX9-O3-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 
+; GFX9-O3-NEXT: .LBB8_2: ; %merge +; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX9-O3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-O3-NEXT: v_and_b32_e32 v0, 2, v0